In [1]:
# PyAlex and Data Creation imports
from pyalex import Works, Authors, Sources, Institutions, Topics, Publishers, Funders
import pyalex, os, csv
pyalex.config.email = "b.cliff@gwmail.gwu.edu"
from itertools import chain

In [2]:
# Data Validation imports
from pydantic import BaseModel, Field, ValidationError
from typing import Optional, List

#### Configuring Output Files

In [3]:
# Destination CSV path for each exported entity type, relative to the working directory.
WORKS_FILE, AUTHORS_FILE, TOPICS_FILE = (
    './data/works.csv',
    './data/authors.csv',
    './data/topics.csv',
)

In [32]:
# Clear out leftover CSVs from earlier runs so each export starts from a clean slate.
for stale_path in filter(os.path.exists, (WORKS_FILE, AUTHORS_FILE, TOPICS_FILE)):
    os.remove(stale_path)

In [33]:
# Make sure the output directory exists before opening the file: open(..., 'w')
# creates the file but not its parent folder, so a missing directory would raise
# FileNotFoundError.
works_dir = os.path.dirname(WORKS_FILE)
if works_dir:  # dirname is '' for a bare filename, and os.makedirs('') would raise
    os.makedirs(works_dir, exist_ok=True)

# newline='' leaves line-ending handling to the csv module; the file is written as UTF-8.
works_file = open(WORKS_FILE, 'w', newline='', encoding='utf-8')

In [34]:
# Creating the CSV writer bound to the open works file (the header row itself is written in the next cell)
works_writer = csv.writer(works_file)

In [35]:
# Emit the header row; the column order matches the values written for each work.
works_writer.writerow([
    'id',
    'doi',
    'title',
    'publication_date',
    'type',
    'related_works',
    'corresponding_author_ids',
])

75

#### Creating Pydantic Models for Validation

In [36]:
class WorksModel(BaseModel):
    """Validated subset of an OpenAlex work record.

    ``id``, ``title`` and ``publication_date`` are required strings, so a
    record that lacks one of them (or carries ``None`` there) fails
    validation. Every other field is optional and defaults to ``None``.
    """

    id: str
    doi: Optional[str] = None
    title: str
    publication_date: str
    type: Optional[str] = None
    # Declared Optional so the annotation agrees with the None default and an
    # explicit null in the input validates instead of raising (pydantic v2 does
    # not infer Optional from a None default).
    related_works: Optional[List[str]] = None
    corresponding_author_ids: Optional[List[str]] = None

### Data Retrieval from OpenAlex

In [37]:
# URL prefix stripped from OpenAlex identifiers so that only the short ID remains.
oa_replace = "https://openalex.org/"

In [38]:
# Request only the fields that get exported, 200 works per page.
# NOTE(review): paginate() is called without n_max; check pyalex's default result
# cap if a complete harvest is intended.
works_pager = (
    Works()
    .select([
        'id',
        'doi',
        'title',
        'publication_date',
        'type',
        'related_works',
        'corresponding_author_ids',
    ])
    .paginate(per_page=200)
)

#### Data Cleaning

In [39]:
# Walk every page, strip the OpenAlex URL prefix from identifiers, validate the
# record, and append one CSV row per valid work. Records that fail validation or
# lack an expected key are reported and skipped.
#
# try/finally guarantees the output file is flushed and closed even when the
# download is interrupted or an unexpected error escapes the per-record handler.
try:
    for page in works_pager:
        for raw in page:
            try:
                raw['id'] = raw['id'].replace(oa_replace, '')
                # `or []` guards against an explicit null in a list field, which
                # would otherwise raise an uncaught TypeError and abort the export.
                raw['related_works'] = [
                    rel.replace(oa_replace, '')
                    for rel in (raw['related_works'] or [])
                ]
                raw['corresponding_author_ids'] = [
                    rel.replace(oa_replace, '')
                    for rel in (raw['corresponding_author_ids'] or [])
                ]
                w = WorksModel(**raw)
                # csv.writer stringifies non-string cells with str(), so the two
                # list fields land in the file as Python list literals.
                works_writer.writerow([
                    w.id,
                    w.doi,
                    w.title,
                    w.publication_date,
                    w.type,
                    w.related_works,
                    w.corresponding_author_ids,
                ])
            except (ValidationError, KeyError) as e:
                print("Skipping invalid record:", e)
finally:
    works_file.close()


In [None]:
# cleaned_ids = [work['id'].replace('https://openalex.org/', '') for work in chain(*works_pager)]
# cleaned_related_works = [[rel.replace('https://openalex.org/', '') for rel in work['related_works']] for work in chain(*works_pager)]


In [None]:
# for work in chain(*works_pager):
#     print(work['title'], work['publication_date'], work['type'])

In [None]:
# cleaned_related_works

#### Including Filter and Select

In [18]:
# Fetch one example work and list the top-level fields it exposes.
example_work = Works()['W2889162861']
example_work.keys()

dict_keys(['id', 'doi', 'title', 'display_name', 'publication_year', 'publication_date', 'ids', 'language', 'primary_location', 'type', 'indexed_in', 'open_access', 'authorships', 'institutions', 'countries_distinct_count', 'institutions_distinct_count', 'corresponding_author_ids', 'corresponding_institution_ids', 'apc_list', 'apc_paid', 'fwci', 'has_fulltext', 'cited_by_count', 'citation_normalized_percentile', 'cited_by_percentile_year', 'biblio', 'is_retracted', 'is_paratext', 'is_xpac', 'primary_topic', 'topics', 'keywords', 'concepts', 'mesh', 'locations_count', 'locations', 'best_oa_location', 'sustainable_development_goals', 'awards', 'funders', 'has_content', 'referenced_works_count', 'referenced_works', 'related_works', 'abstract_inverted_index', 'counts_by_year', 'updated_date', 'created_date'])

In [26]:
# Look at the inverted-index form of the example work's abstract (token -> list of positions).
abstract_work = Works()['W2889162861']
abstract_work['abstract_inverted_index']

{'The': [0, 137],
 'TRIGA': [1],
 'Mark-III': [2],
 'nuclear': [3, 71],
 'reactor': [4, 13],
 'has': [5],
 'several': [6],
 'intrinsic': [7],
 'security': [8],
 'specifications': [9],
 'related': [10],
 'with': [11, 73],
 'the': [12,
  21,
  27,
  39,
  46,
  63,
  66,
  70,
  78,
  87,
  90,
  95,
  99,
  102,
  108,
  116,
  121,
  124,
  128,
  132,
  135,
  155,
  159,
  162,
  166],
 'core': [14],
 'and': [15, 32, 83, 98, 127, 148],
 'a': [16, 59, 144],
 'biological': [17],
 'shield': [18, 26, 64, 109, 138],
 'provided': [19],
 'by': [20, 143],
 'bathtub': [22],
 'itself.': [23],
 'However,': [24],
 'to': [25, 29, 42, 76, 85, 114],
 'exposition': [28],
 'gamma': [30, 125],
 'rays': [31],
 'thermal': [33],
 'neutrons': [34],
 'is': [35, 112, 141],
 'vital': [36],
 'for': [37, 61, 65],
 'protecting': [38],
 'workers': [40],
 'exposed': [41],
 'ionizing': [43],
 'radiation': [44, 126],
 'or': [45],
 'incorporation': [47],
 'of': [48, 69, 89, 123, 134, 154, 161],
 'radioactive': [49],

In [27]:
# Print the first work returned by Works().get() whose authorships list is not empty.
for work in Works().get():
    if work['authorships'] == []:
        continue  # nothing to show for this one; keep looking
    print(work)
    break

{'id': 'https://openalex.org/W3038568908', 'doi': 'https://doi.org/10.1585/pfr.15.2402039', 'title': 'Radiation Resistant Camera System for Monitoring Deuterium Plasma Discharges in the Large Helical Device', 'display_name': 'Radiation Resistant Camera System for Monitoring Deuterium Plasma Discharges in the Large Helical Device', 'publication_year': 2020, 'publication_date': '2020-06-08', 'ids': {'openalex': 'https://openalex.org/W3038568908', 'doi': 'https://doi.org/10.1585/pfr.15.2402039', 'mag': '3038568908'}, 'language': 'en', 'primary_location': {'id': 'doi:10.1585/pfr.15.2402039', 'is_oa': True, 'landing_page_url': 'https://doi.org/10.1585/pfr.15.2402039', 'pdf_url': 'https://www.jstage.jst.go.jp/article/pfr/15/0/15_2402039/_pdf', 'source': {'id': 'https://openalex.org/S46033839', 'display_name': 'Plasma and Fusion Research', 'issn_l': '1880-6821', 'issn': ['1880-6821'], 'is_oa': True, 'is_in_doaj': False, 'is_core': True, 'host_organization': 'https://openalex.org/P4328135220',