In [19]:
import matplotlib.pyplot as plt
import networkx as nx

from collections import Counter
from gensim.models.phrases import Phrases, Phraser
from hvectorspaces.io import PostgresClient

In [86]:
list(range(1930,2030,10))

[1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000, 2010, 2020]

In [87]:
fields = [
    "title", 
    "publication_year", 
    "domain", 
    "field", 
    "topic",
    "abstract"
]

data_per_decade = list()
decades = list(range(1920,2030,10))

with PostgresClient() as client:
    for d in decades:
        citation_map = client.fetch_per_decade_data(d, additional_fields=fields)
        id_to_cited_ids = {}
        id_to_attrs = {} 
        for oa_id, refs, title, publication_year, domain, field, topic, abstract in citation_map:
            id_to_cited_ids[oa_id] = refs
            id_to_attrs[oa_id] = {
                "title": title,
                "publication_year": publication_year,
                "domain": domain,
                "field": field,
                "topic": topic,
                "abstract" : abstract
            }
        data_per_decade.append(id_to_attrs)

In [88]:
for d in data_per_decade:
    print(len(d))

10
32
63
410
1534
3462
6601
20160
82622
161255
60036


In [91]:
from itertools import islice

for k, v in islice(data_per_decade[-1].items(), 2):
    print(k, v)

W1019189097 {'title': 'canonical correlation forests', 'publication_year': 2022, 'domain': 'Physical Sciences', 'field': 'Computer Science', 'topic': 'Neural Networks and Applications', 'abstract': 'We introduce canonical correlation forests (CCFs), a new decision tree ensemble method for classification and regression. Individual canonical correlation trees are binary decision trees with hyperplane splits based on local canonical correlation coefficients calculated during training. Unlike axis-aligned alternatives, the decision surfaces of CCFs are not restricted to the coordinate system of the inputs features and therefore more naturally represent data with correlated inputs. CCFs naturally accommodate multiple outputs, provide a similar computational complexity to random forests, and inherit their impressive robustness to the choice of input parameters. As part of the CCF training algorithm, we also introduce projection bootstrapping, a novel alternative to bagging for oblique decisi

In [34]:
def merge_title_and_abstract(entry:dict[str,str]):
    if isinstance(entry['abstract'], str):
        if isinstance(entry['title'], str):
            return entry['title'] + "\n" + entry['abstract']
        else:
            return entry['abstract']
    elif isinstance(entry['title'], str): 
        return entry['title']
    else:
        return None

In [102]:
import re
import unicodedata

In [90]:
def clean_openalex_text(text: str) -> str:
    # Unicode normalization
    text = unicodedata.normalize("NFKC", text)

    # Remove zero-width and BOM chars
    text = re.sub(r"[\u200b\u200c\u200d\ufeff]", "", text)

    # Remove LaTeX math blocks
    text = re.sub(r"\$[^$]+\$", " ", text)

    # Remove LaTeX commands (\alpha, \mathbb{R}, etc.)
    text = re.sub(r"\\[a-zA-Z]+(\{[^}]*\})?", " ", text)

    # Replace math operators & symbols with space
    text = re.sub(r"[∑∂≈⊗⊕≤≥≠∞√±×÷]", " ", text)

    # Collapse repeated punctuation
    text = re.sub(r"([=+\-*_]){2,}", " ", text)

    # Keep only letters, numbers, basic punctuation
    text = re.sub(r"[^0-9A-Za-zÀ-ÿ.,;:()\-\s]", " ", text)

    # Normalize whitespace
    text = re.sub(r"\s+", " ", text).strip()

    return text

In [149]:
corpus_per_decade = list()

for idx, d in enumerate(data_per_decade):
    txts = [merge_title_and_abstract(v) for v in d.values()]
    txts = [clean_openalex_text(entry.lower()) for entry in txts if isinstance(entry, str)]
    txts_split = [entry.split(" ") for entry in txts if isinstance(entry, str)]
    corpus_per_decade.append(txts_split)
    print(decades[idx])
    print(str(len(txts_split)) + "\n")

1920
10

1930
32

1940
63

1950
410

1960
1534

1970
3462

1980
6601

1990
20156

2000
82605

2010
161251

2020
60036



In [106]:
corpus = [txt for txts in corpus_per_decade for txt in txts]

In [121]:
corpus[-1][:10]

['edge',
 'label',
 'inference',
 'in',
 'generalized',
 'stochastic',
 'block',
 'models:',
 'from',
 'spectral']

In [123]:
%%time

#corpus: List[List[str]]
bigram = Phrases(
    corpus,
    min_count=5,
    threshold=10,
    delimiter="_",   # <-- str, not bytes
)
bigram_phraser = Phraser(bigram)

corpus_bi = [bigram_phraser[doc] for doc in corpus]

trigram = Phrases(
    corpus_bi,
    min_count=5,
    threshold=10,
    delimiter="_",
)
trigram_phraser = Phraser(trigram)

corpus_tri = [trigram_phraser[doc] for doc in corpus_bi]

CPU times: user 1min 23s, sys: 665 ms, total: 1min 24s
Wall time: 1min 24s


In [126]:
corpus_bi[-1][:10]

['edge',
 'label',
 'inference',
 'in',
 'generalized',
 'stochastic_block',
 'models:',
 'from',
 'spectral',
 'theory']

In [127]:
corpus_tri[-1][:10]

['edge',
 'label',
 'inference',
 'in',
 'generalized',
 'stochastic_block',
 'models:',
 'from',
 'spectral',
 'theory']

In [134]:
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from gensim.utils import simple_preprocess
from pathlib import Path
import multiprocessing as mp

# ---------- (A) Optional: write your tokenized docs to disk (recommended if big) ----------
def write_corpus(tokens_iter, out_path: str):
    out_path = Path(out_path)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w", encoding="utf-8") as f:
        for doc in tokens_iter:
            if not doc:
                continue
            f.write(" ".join(doc) + "\n")

# If you already have corpus_tri in memory, uncomment:
write_corpus(corpus_tri, "../data/openalex_phrased.txt")

# ---------- (B) Stream sentences from disk (memory-safe) ----------
sentences = LineSentence("../data/openalex_phrased.txt")  # one tokenized doc per line

In [137]:
import time
from gensim.models.callbacks import CallbackAny2Vec

class EpochLogger(CallbackAny2Vec):
    def __init__(self, total_epochs):
        self.total_epochs = total_epochs
        self.epoch = 0
        self.start_time = time.time()

    def on_epoch_end(self, model):
        self.epoch += 1
        elapsed = time.time() - self.start_time
        avg_epoch = elapsed / self.epoch
        remaining = avg_epoch * (self.total_epochs - self.epoch)

        print(
            f"Epoch {self.epoch}/{self.total_epochs} | "
            f"elapsed: {elapsed/60:.1f} min | "
            f"ETA: {remaining/60:.1f} min"
        )


In [None]:
# ---------- (C) Train word2vec ----------
workers = max(1, mp.cpu_count() - 1)
epochs = 10

w2v = Word2Vec(
    sentences=sentences,
    vector_size=300,      # 100–300 common; 300 is a solid baseline
    window=5,             # try 5–10 for scholarly text
    min_count=10,         # raise if huge corpus; lower if small
    sg=1,                 # 1=skip-gram (often better); 0=CBOW (faster)
    negative=10,
    sample=1e-4,
    epochs=epochs,
    workers=workers,
    callbacks=[EpochLogger(epochs)],
)

# ---------- (D) Save ----------
out_dir = Path("../models/w2v_openalex_baseline")
out_dir.mkdir(parents=True, exist_ok=True)

w2v.save(out_dir / "word2vec.model")                        # full model
w2v.wv.save(out_dir / "word2vec.kv")                        # keyed vectors (recommended)
w2v.wv.save_word2vec_format(str(out_dir / "word2vec.vec"), binary=False) # text vecs (portable)

In [146]:
# ---------- (E) Quick sanity checks ----------
for q in ["vector_space", "hilbert_space", "banach_space", "embedding_space"]:
    if q in w2v.wv:
        print(q, "→", w2v.wv.most_similar(q, topn=20))
        print("\n")
    else:
        print(q, "not in vocab")

vector_space → [('vector_spaces', 0.6354857087135315), ('vector-space', 0.6096470355987549), ('semantics:', 0.5951888561248779), ('distributional_word', 0.5766538381576538), ('information_retrieval:', 0.5727638006210327), ('article_free_access_share', 0.5510422587394714), ('arabic_text', 0.5459734201431274), ('contextualized_word', 0.5454588532447815), ('locally_convex_topological', 0.544205904006958), ('random_indexing', 0.5427467823028564), ('multi-word', 0.5399446487426758), ('word_vector', 0.5397936105728149), ('compositional_distributional', 0.5382021069526672), ('document_indexing', 0.5368563532829285), ('cross-language_information_retrieval', 0.535944938659668), ('embeddings:', 0.5334151983261108), ('handwritten_text', 0.5328021049499512), ('formal_concept', 0.5313637256622314), ('relation_paths', 0.528424084186554), ('an_ontology-based', 0.5275920629501343)]


hilbert_space → [('hilbert_space,', 0.6895403861999512), ('hilbert_space.', 0.6610992550849915), ('hilbert_spaces.', 0.

In [157]:
G = nx.DiGraph()

for src, targets in id_to_cited_ids.items():
    for tgt in targets:
        G.add_edge(src, tgt)
        
nx.set_node_attributes(G, id_to_attrs)

In [158]:
len(G.nodes)

2478

In [159]:
sinks = [n for n in G.nodes if G.out_degree(n) == 0]
sources = [n for n in G.nodes if G.in_degree(n) == 0]

In [160]:
len(sources)

874

In [169]:
G.node

{}

In [191]:
topics = [attrs.get("topic") if attrs.get("topic") is not None else "None" for _, attrs in G.nodes(data=True)]
domains = [attrs.get("domain") if attrs.get("domain") is not None else "None" for _, attrs in G.nodes(data=True)]
fields = [attrs.get("field") if attrs.get("field") is not None else "None" for _, attrs in G.nodes(data=True)]

In [203]:
Counter(topics).most_common(30)

[('None', 625),
 ('Particle physics theoretical and experimental studies', 75),
 ('Quantum Chromodynamics and Particle Interactions', 52),
 ('Cosmology and Gravitation Theories', 49),
 ('Black Holes and Theoretical Physics', 48),
 ('Matrix Theory and Algorithms', 43),
 ('Rings, Modules, and Algebras', 39),
 ('Electron and X-Ray Spectroscopy Techniques', 38),
 ('Mathematical Dynamics and Fractals', 32),
 ('Optimization and Variational Analysis', 28),
 ('Advanced Statistical Methods and Models', 26),
 ('Holomorphic and Operator Theory', 25),
 ('Spectral Theory in Mathematical Physics', 25),
 ('Quantum Mechanics and Applications', 24),
 ('Geometric Analysis and Curvature Flows', 24),
 ('Atomic and Molecular Physics', 22),
 ('High-pressure geophysics and materials', 22),
 ('Advanced Algebra and Geometry', 21),
 ('Advanced Topics in Algebra', 21),
 ('Economic theories and models', 21),
 ('Stability and Controllability of Differential Equations', 21),
 ('advanced mathematical theories', 21),

In [193]:
Counter(domains).most_common(10)

[('Physical Sciences', 1623),
 ('None', 625),
 ('Social Sciences', 162),
 ('Life Sciences', 55),
 ('Health Sciences', 13)]

In [194]:
Counter(fields).most_common(20)

[('None', 625),
 ('Physics and Astronomy', 528),
 ('Mathematics', 500),
 ('Computer Science', 214),
 ('Engineering', 185),
 ('Materials Science', 95),
 ('Economics, Econometrics and Finance', 89),
 ('Earth and Planetary Sciences', 59),
 ('Decision Sciences', 39),
 ('Chemistry', 23),
 ('Agricultural and Biological Sciences', 20),
 ('Neuroscience', 19),
 ('Environmental Science', 17),
 ('Business, Management and Accounting', 15),
 ('Biochemistry, Genetics and Molecular Biology', 15),
 ('Medicine', 11),
 ('Social Sciences', 9),
 ('Psychology', 7),
 ('Arts and Humanities', 3),
 ('Energy', 2)]

In [195]:
refs = [t for tgts in id_to_cited_ids.values() for t in tgts]
len(refs)

5163

In [None]:
check = []
for r in refs:
    check.append(r in to_nodes)
    
Counter(check)