<a href="https://colab.research.google.com/github/ByungjunKim/OpenAlexAPI/blob/main/OpenAlex_Tokenization_TopicModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Collecting Bibliographic Data Using the OpenAlex API, Tokenizing, and Applying Topic Modeling

In [None]:
pip install -U -q requests natsort tqdm pyalex tomotopy gensim nltk 'spacy[cuda-autodetect]'

In [None]:
# Download the spaCy English model (the sm (small) model is used to keep the download fast)
!python -m spacy download en_core_web_sm

In [None]:
import pandas as pd
import requests
from tqdm.auto import tqdm
tqdm.pandas()
import json
import glob
import time
import re
from natsort import natsorted
import itertools
import matplotlib.pyplot as plt
import pyalex
from pyalex import Works, Authors, Sources, Institutions, Topics, Publishers, Funders

import spacy
print(spacy.prefer_gpu()) # use the GPU; prints the value prefer_gpu() returns
# Load the small English pipeline with the parser and NER components disabled
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
# Add the sentencizer component to the pipeline
nlp.add_pipe('sentencizer')

from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

from collections import Counter
from itertools import chain

import tomotopy as tp
# print(tp.isa)
import sys

import numpy as np

### OpenAlex API (pyalex)

In [None]:
# Email address handed to pyalex's configuration.
# NOTE(review): presumably used to identify the client to the OpenAlex API — confirm in the pyalex docs.
pyalex.config.email = "kuntakim88@gmail.com" # replace with your own email address

In [None]:
def fetch_papers_by_title(title_keyword, start_year=None, end_year=None, test_mode=False):
    """Collect OpenAlex works whose title matches a keyword.

    Args:
        title_keyword: Text to search for in work titles.
        start_year: Optional first publication year (inclusive, from January 1st).
            May be given without ``end_year`` for an open-ended range.
        end_year: Optional last publication year (inclusive, until December 31st).
            May be given without ``start_year`` for an open-ended range.
        test_mode: When True, request 10 works per page and stop after the
            first page; otherwise request 200 per page and read every page.

    Returns:
        A list with every work record yielded by the paginator.
    """
    # Set the number of results per page based on test_mode
    per_page = 10 if test_mode else 200

    # Title search, narrowed by whichever publication-date bounds were given.
    # Each bound is applied on its own so that passing only one of them is
    # honoured instead of being silently ignored.
    search_filter = pyalex.Works().search_filter(title=title_keyword)
    if start_year is not None:
        search_filter = search_filter.filter(from_publication_date=f"{start_year}-01-01")
    if end_year is not None:
        search_filter = search_filter.filter(to_publication_date=f"{end_year}-12-31")

    # Create a paginator to get all works with the specified keyword and year range
    pager = search_filter.paginate(per_page=per_page, n_max=None)

    # Get the total number of works
    total_works = search_filter.count()
    print(f"Total number of works related to '{title_keyword}' from {start_year} to {end_year}: {total_works}")

    # Progress-bar length: exactly one page in test mode, otherwise an
    # estimate of the number of pages derived from the reported total.
    expected_pages = 1 if test_mode else (total_works // per_page) + 1

    papers = []

    # Iterate through each page and collect all bibliographic information (limit to one page if test_mode is True)
    for page in tqdm(pager, total=expected_pages, desc="Collecting papers"):
        papers.extend(page)
        if test_mode:
            break  # Only collect one page for testing purposes

    # Print the total number of works collected
    print(f"Total number of works collected: {len(papers)}")

    return papers

In [None]:
# Example run: collect every work titled with "renewable energy" published 2022-2024
results = fetch_papers_by_title(
    title_keyword="renewable energy",
    start_year=2022,
    end_year=2024,
)

In [None]:
# Convert the collected records to a pandas DataFrame (one row per work)
df = pd.DataFrame(results)
df

In [None]:
# Keep the first row for each work id and renumber the index from 0
df = df.drop_duplicates(subset='id', ignore_index=True)

In [None]:
df.columns

In [None]:
def index_to_text(abstract_inverted_index):
    """Rebuild plain text from an inverted index.

    ``abstract_inverted_index`` maps each word to the list of positions at
    which it occurs. The words are laid out in ascending position order and
    joined with single spaces.
    """
    word_at = {
        position: word
        for word, positions in abstract_inverted_index.items()
        for position in positions
    }
    return ' '.join(word_at[position] for position in sorted(word_at))

In [None]:
# Rebuild the abstract text from the inverted index; rows without an index stay NaN
df['abstract'] = df['abstract_inverted_index'].dropna().progress_map(index_to_text)
df['abstract']

In [None]:
# Drop the rows that have no abstract and renumber the index from 0
df = df.loc[df['abstract'].notna()].reset_index(drop=True)

In [None]:
len(df)

### Tokenization with spaCy

In [None]:
# https://spacy.io/usage/linguistic-features
# Extract lemmatized tokens tagged with their part of speech as 'lemma/POS' (https://wikidocs.net/21707)
# nlp.pipe streams the abstracts through the pipeline in batches, which is
# considerably faster than calling nlp() once per document.
df['tokens'] = pd.Series(
    [
        [token.lemma_ + '/' + token.pos_ for token in doc]
        for doc in tqdm(nlp.pipe(df['abstract']), total=len(df), desc='Tokenizing')
    ],
    index=df.index,
)
# Alternative without POS tags:
# df['tokens'] = pd.Series([[token.lemma_ for token in doc] for doc in nlp.pipe(df['abstract'])], index=df.index)
df['tokens']

In [None]:
allowed_postags = ['ADJ','NOUN','VERB','PROPN','ADV'] # POS tags to keep (adjective, noun, verb, proper noun, adverb)

In [None]:
# Keep only the tokens whose POS tag is in allowed_postags.
# Tokens have the form 'lemma/POS'. The tag is read after the LAST '/', because
# the lemma itself can contain '/' (e.g. a URL); splitting on the first '/'
# would then read a piece of the lemma instead of the tag.
df['allowed_tokens'] = df['tokens'].map(
    lambda x: [token for token in x if token.rsplit('/', 1)[-1] in allowed_postags]
)
df['allowed_tokens']

In [None]:
# Count every token across all documents and show the 30 most frequent
unigram = chain.from_iterable(df['allowed_tokens'])
cnt = Counter(unigram)
cnt.most_common(30)

In [None]:
# Custom stop words, written in the same 'lemma/POS' form as the tokens they are compared against
stop_words = ['%/NOUN', 'paper/NOUN', 'research/NOUN', 'study/NOUN']

In [None]:
# Filter the custom stop words out of every document's token list
df['allowed_tokens'] = df['allowed_tokens'].map(
    lambda tokens: [tok for tok in tokens if tok not in stop_words]
)

In [None]:
# Recount the tokens after stop-word removal and show the 30 most frequent
unigram = chain.from_iterable(df['allowed_tokens'])
cnt = Counter(unigram)
cnt.most_common(30)

### Topic Model with tomotopy

In [None]:
# Flatten each work's topic records into lists of display names:
# one column for the topics themselves, then one per level read from each topic record.
df['topics_name'] = df['topics'].dropna().progress_map(
    lambda topic_list: [topic['display_name'] for topic in topic_list]
)
level_columns = {'subfield': 'subfield_name', 'field': 'field_name', 'domain': 'domain_name'}
for level, column in level_columns.items():
    # level=level binds the current key, so each lambda reads its own level
    df[column] = df['topics'].dropna().progress_map(
        lambda topic_list, level=level: [topic[level]['display_name'] for topic in topic_list]
    )

In [None]:
# Frequency of the first-listed (prime) topic; rows with no topic yield NaN and are not counted
df['topics_name'].str[0].value_counts()

In [None]:
# Frequency of the first-listed field; rows with no field yield NaN and are not counted
df['field_name'].str[0].value_counts()

### 1. LDA
https://bab2min.github.io/tomotopy

##### 모델 초기화

In [None]:
# LDA hyperparameters
LDA = tp.LDAModel(
    k=10,
    min_df=10,
    tw=tp.TermWeight.PMI,
    rm_top=3,
    seed=2021,
)

##### 빈 모델에 토큰 리스트 넣어주기

In [None]:
# Add each document's token list to the model
for doc_tokens in tqdm(list(df['allowed_tokens'])):
    LDA.add_doc(doc_tokens)

##### 모델 학습

In [None]:
# Call train() with 0 iterations before the real training loop.
# NOTE(review): presumably this initializes the model so that the corpus statistics
# printed in the next cell are populated — confirm against the tomotopy documentation.
LDA.train(0)

In [None]:
# Train for 500 iterations in total, 20 iterations at a time
print('Num docs:', len(LDA.docs), ', Vocab size:', LDA.num_vocabs, ', Num words:', LDA.num_words)
print('Removed top words:', LDA.removed_top_words)
print('Training...', file=sys.stderr, flush=True)
for i in range(0, 500, 20):
    LDA.train(20)
    # i is the iteration count at the START of the chunk that was just trained
    print('Iteration: {}\tLog-likelihood: {}'.format(i, LDA.ll_per_word))

In [None]:
# Training results
LDA.summary()

##### 토픽별 TopN 단어 확인

In [None]:
# Print the 10 highest-ranked words of every topic on one line
for topic_id in range(LDA.k):
    top_words = [word for word, _ in LDA.get_topic_words(topic_id, top_n=10)]
    print('Topic #{}'.format(topic_id), end='\t')
    print(', '.join(top_words))

##### 토픽 이름 자동으로 붙이기 (참고용)

In [None]:
# Collect candidate phrases for automatic topic labeling
extractor = tp.label.PMIExtractor(min_cf=10, min_df=10, max_len=5, max_cand=10000)
cands = extractor.extract(LDA)

# Rank the candidates per topic, then print each topic's labels followed by its top words
labeler = tp.label.FoRelevance(LDA, cands, min_df=10, smoothing=1e-2, mu=0.25)
for topic_id in range(LDA.k):
    print("== Topic #{} ==".format(topic_id))
    top_labels = [label for label, _ in labeler.get_topic_labels(topic_id, top_n=5)]
    print("Labels:", ', '.join(top_labels))
    for word, prob in LDA.get_topic_words(topic_id, top_n=10):
        print(word, prob, sep='\t')
    print()

In [None]:
# !sudo apt-get update
# !sudo apt-get install -y locales
# !sudo locale-gen en_US.UTF-8
# !sudo update-locale LANG=en_US.UTF-8
# import locale
# locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
# # Install tornado after setting the locale
# !pip install tornado --upgrade

In [None]:
# https://github.com/bab2min/tomotopy/blob/0be609df83e606cc8c14240f5b552096f9435351/README.rst#interactive-model-viewer
# tp.viewer.open_viewer(LDA, host="localhost", port=9998)

### 2. DTM (Dynamic Topic Model)
"Time series topic model"

In [None]:
df['publication_year'].value_counts()

In [None]:
# Convert the years to the t (timepoint) form passed to DTM: offsets starting at 0
df['publication_year'] -= df['publication_year'].min()

In [None]:
# 0 : 2022, 1:2023, 2:2024
df['publication_year'].value_counts()

##### 모델 초기화

In [None]:
# t is the number of time periods the DTM covers (t = 3 means three periods).
# It is derived from the data instead of being hard-coded: publication_year
# holds offsets starting at 0, so the number of periods is the largest offset + 1.
num_timepoints = int(df['publication_year'].max()) + 1
DTM = tp.DTModel(k=10, min_df=10, tw=tp.TermWeight.PMI, t=num_timepoints, rm_top=3, seed=2021)

##### 빈 모델에 토큰 리스트와 시간변수 넣어주기

In [None]:
# Row index -> {'allowed_tokens': [...], 'publication_year': ...}
token_year_dict = df[['allowed_tokens', 'publication_year']].to_dict(orient='index')

In [None]:
token_year_dict[0]

In [None]:
# Add every document together with its timepoint
for entry in tqdm(token_year_dict.values()):
    DTM.add_doc(entry['allowed_tokens'], entry['publication_year'])

##### 모델 학습

In [None]:
# Call train() with 0 iterations before the real training loop.
# NOTE(review): presumably this initializes the model so that the corpus statistics
# printed in the next cell are populated — confirm against the tomotopy documentation.
DTM.train(0)

In [None]:
# Train for 500 iterations in total, 20 iterations at a time
print('Num docs:', len(DTM.docs), ', Vocab size:', DTM.num_vocabs, ', Num words:', DTM.num_words)
print('Removed top words:', DTM.removed_top_words)
print('Training...', file=sys.stderr, flush=True)
for i in range(0, 500, 20):
    DTM.train(20)
    # i is the iteration count at the START of the chunk that was just trained
    print('Iteration: {}\tLog-likelihood: {}'.format(i, DTM.ll_per_word))

In [None]:
# Training results
DTM.summary()

##### 토픽별 TopN 단어 확인

In [None]:
# Print the top 10 words of every topic at every timepoint.
# The number of timepoints is read from the model rather than hard-coded, and
# each line names its timepoint so the rows of one topic can be told apart.
for i in range(DTM.k):
    for t in range(DTM.num_timepoints):
        res = DTM.get_topic_words(i, t, top_n=10)
        print('Topic #{} (t={})'.format(i, t), end='\t')
        print(', '.join(w for w, p in res))

##### 토픽 이름 자동으로 붙이기(참고용)

In [None]:
# extract candidates for auto topic labeling
extractor = tp.label.PMIExtractor(min_cf=10, min_df=10, max_len=5, max_cand=10000)
cands = extractor.extract(DTM)

# Print each topic's labels, then its top words once per timepoint.
# The number of timepoints comes from the model rather than a hard-coded 3,
# and every word list is headed by the timepoint it belongs to.
labeler = tp.label.FoRelevance(DTM, cands, min_df=10, smoothing=1e-2, mu=0.25)
for k in range(DTM.k):
    print("== Topic #{} ==".format(k))
    print("Labels:", ', '.join(label for label, score in labeler.get_topic_labels(k, top_n=5)))
    for t in range(DTM.num_timepoints):
        print("-- t={} --".format(t))
        for word, prob in DTM.get_topic_words(k, t, top_n=10):
            print(word, prob, sep='\t')
        print()

##### 시간에 따른 토픽 비중 변화

In [None]:
# Sum the per-document topic distributions within each timepoint ...
topic_dist_by_time = np.zeros((DTM.num_timepoints, DTM.k), dtype=np.float64)
for doc in DTM.docs:
    topic_dist_by_time[doc.timepoint] += doc.get_topic_dist()

# ... then divide each row by that timepoint's document count to get the mean
topic_dist_by_time /= DTM.num_docs_by_timepoint[:, None]

# Per topic: its first five words at timepoint 0, followed by its share at each timepoint
for topic_id in range(DTM.k):
    leading_words = [w for w, _ in DTM.get_topic_words(topic_id, 0, top_n=5)]
    print('Topic #{}'.format(topic_id), *leading_words)
    print(topic_dist_by_time[:, topic_id])

In [None]:
# Wrap the array in a DataFrame whose rows are labelled with the calendar years
topic_dist_by_time = pd.DataFrame(topic_dist_by_time, index=[2022, 2023, 2024])

In [None]:
# Line chart of each topic's mean share per year
fig, ax = plt.subplots()
ax.set_title('Topic distribution by year', color='black')
topic_dist_by_time.plot(ax=ax, colormap='gnuplot')
# One tick per row label of the table, instead of a hard-coded year range
plt.xticks(topic_dist_by_time.index, rotation=45)
# Legend outside the axes, to the right
ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
# The plotted values are proportions in [0, 1], not percentages
ax.set_ylabel('Mean topic proportion')
plt.show()

### 3. DMR (Dirichlet Multinomial Regression)
"Topic proportion differences by categorical variables"

##### setting up categorical variables

In [None]:
# Frequency of the first-listed field; rows with no field yield NaN and are not counted
df['field_name'].str[0].value_counts()

In [None]:
# The four most frequent first-listed fields
top4_field = df['field_name'].str[0].value_counts().head(4).index.tolist()

In [None]:
# Keep only the rows that list at least one field, renumbered from 0
df_field = df[df['field_name'].str.len().gt(0)].reset_index(drop=True)

In [None]:
# Reduce each list of fields to its first entry
df_field['field_name'] = df_field['field_name'].str[0]

In [None]:
# Keep the field name when it is one of top4_field, otherwise replace it with 'other'
df_field['field_name_top4'] = df_field['field_name'].where(
    df_field['field_name'].isin(top4_field), 'other'
)

##### 모델 초기화

In [None]:
# DMR hyperparameters
DMR = tp.DMRModel(
    k=10,
    min_df=10,
    tw=tp.TermWeight.PMI,
    rm_top=3,
    seed=2021,
)

##### 빈 모델에 토큰 리스트와 명목변수 넣어주기

In [None]:
# Row index -> {'allowed_tokens': [...], 'field_name_top4': ...}
token_cate_dict = df_field[['allowed_tokens', 'field_name_top4']].to_dict(orient='index')

In [None]:
# Add every document together with its category label
for entry in tqdm(token_cate_dict.values()):
    DMR.add_doc(entry['allowed_tokens'], entry['field_name_top4'])

##### 모델 학습

In [None]:
# Call train() with 0 iterations before the real training loop.
# NOTE(review): presumably this initializes the model so that the corpus statistics
# printed in the next cell are populated — confirm against the tomotopy documentation.
DMR.train(0)

In [None]:
# Train for 500 iterations in total, 20 iterations at a time
print('Num docs:', len(DMR.docs), ', Vocab size:', DMR.num_vocabs, ', Num words:', DMR.num_words)
print('Removed top words:', DMR.removed_top_words)
print('Training...', file=sys.stderr, flush=True)
for i in range(0, 500, 20):
    DMR.train(20)
    # i is the iteration count at the START of the chunk that was just trained
    print('Iteration: {}\tLog-likelihood: {}'.format(i, DMR.ll_per_word))

##### 토픽별 TopN 단어 확인

In [None]:
# Print the 10 highest-ranked words of every topic on one line
for topic_id in range(DMR.k):
    top_words = [word for word, _ in DMR.get_topic_words(topic_id, top_n=10)]
    print('Topic #{}'.format(topic_id), end='\t')
    print(', '.join(top_words))

##### 토픽 이름 자동으로 붙이기 (참고용)

In [None]:
# Collect candidate phrases for automatic topic labeling
extractor = tp.label.PMIExtractor(min_cf=10, min_df=10, max_len=5, max_cand=10000)
cands = extractor.extract(DMR)

# Rank the candidates per topic, then print each topic's labels followed by its top words
labeler = tp.label.FoRelevance(DMR, cands, min_df=10, smoothing=1e-2, mu=0.25)
for topic_id in range(DMR.k):
    print("== Topic #{} ==".format(topic_id))
    top_labels = [label for label, _ in labeler.get_topic_labels(topic_id, top_n=5)]
    print("Labels:", ', '.join(top_labels))
    for word, prob in DMR.get_topic_words(topic_id, top_n=10):
        print(word, prob, sep='\t')
    print()

##### metadata에 따른 토픽 분포 대조
https://github.com/bab2min/tomotopy/blob/main/examples/dmr_plot.py

In [None]:
DMR.metadata_dict

In [None]:
# Softmax over DMR.lambdas along axis 0; the column-wise maximum is subtracted
# before exponentiating, and each column is then normalised to sum to 1
probs = np.exp(DMR.lambdas - DMR.lambdas.max(axis=0))
probs /= probs.sum(axis=0)

print('Topic proportions by categorical variable')
for col, metadata_name in enumerate(DMR.metadata_dict):
    print(metadata_name, probs[:, col], '\n')

# Grouped bar chart: one group per topic, one bar per metadata value
x = np.arange(DMR.k)
width = 1 / (DMR.f + 2)

fig, ax = plt.subplots(figsize=(10, 6))  # larger figure for better visibility
for col, metadata_name in enumerate(DMR.metadata_dict):
    bar_positions = x + width * (col - DMR.f / 2)
    ax.bar(bar_positions, probs[:, col], width, label=metadata_name)

ax.set_ylabel('Probabilities')
ax.set_yscale('log')
ax.set_title('Topic proportions by categorical variable')
ax.set_xticks(x)
topic_tick_labels = ['Topic #{}'.format(k) for k in range(DMR.k)]
ax.set_xticklabels(topic_tick_labels, rotation=45)  # rotate the x-axis labels by 45 degrees

# Put the legend outside the plot and leave room for it on the right
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
fig.subplots_adjust(right=0.75)

plt.show()
