In [None]:
# PyAlex and Data Creation imports
from pyalex import Works, Authors, Sources, Institutions, Topics, Publishers, Funders
import pyalex, os, csv
# Set the contact email on pyalex's global configuration.
# NOTE(review): presumably this is what identifies the client to OpenAlex
# (its "polite pool") — confirm against the pyalex documentation.
pyalex.config.email = "b.cliff@gwmail.gwu.edu"
from itertools import chain

In [None]:
# Data Validation imports
from pydantic import BaseModel, Field, ValidationError
from typing import Optional, List

#### Configuring Output Files

In [None]:
# Output CSV locations, relative to the current working directory.
# Only WORKS_FILE is written in the visible part of this notebook; the other
# two paths are only cleaned up there.
WORKS_FILE = './data/works.csv'
AUTHORS_FILE = './data/authors.csv'
TOPICS_FILE = './data/topics.csv'

In [None]:
# Start from a clean slate: delete any vestigial output files left by a
# previous run, and make sure each output directory exists so the files can
# be (re)created afterwards.
for path in [WORKS_FILE, AUTHORS_FILE, TOPICS_FILE]:
    try:
        # EAFP: removing directly avoids the race between exists() and remove().
        os.remove(path)
    except FileNotFoundError:
        pass  # nothing left over from an earlier run
    parent_dir = os.path.dirname(path)
    if parent_dir:  # os.makedirs('') would raise for a bare file name
        os.makedirs(parent_dir, exist_ok=True)

In [None]:
# Open the works CSV for writing ('w' truncates any existing content).
# newline='' is what the csv module documentation asks for on files handed to
# csv.writer. The handle intentionally stays open across cells and is closed
# once the export loop has finished.
works_file = open(WORKS_FILE, 'w', newline='',encoding='utf-8')

In [None]:
# Create the CSV writer that serializes rows into works_file
works_writer = csv.writer(works_file)

In [None]:
# Emit the header row; the column order mirrors the rows written during
# data cleaning.
works_writer.writerow([
    'id',
    'doi',
    'title',
    'publication_date',
    'type',
    'related_works',
    'corresponding_author_ids',
])

#### Creating Pydantic Models for Validation

In [None]:
class WorksModel(BaseModel):
    """Validated subset of a work record, as exported to the works CSV.

    ``id``, ``title`` and ``publication_date`` are required strings, so a
    record that lacks one of them (or carries ``None`` there) is rejected
    with a ``ValidationError``. All other fields are optional and default
    to ``None``.
    """

    id: str
    doi: Optional[str] = None
    title: str
    publication_date: str
    type: Optional[str] = None
    # Optional[...] so the annotation agrees with the None default and an
    # explicitly passed None validates as well.
    related_works: Optional[List[str]] = None
    corresponding_author_ids: Optional[List[str]] = None

### Data Retrieval from OpenAlex

In [None]:
# URL prefix that is stripped from IDs during data cleaning, leaving only the
# bare identifier
oa_replace = 'https://openalex.org/'

In [None]:
# Build a paginator over OpenAlex works that requests only the exported
# fields, 200 records per page.
# NOTE(review): pyalex's paginate() may cap the total number of results by
# default — confirm whether n_max needs to be set for this export.
works_pager = (
    Works()
    .select([
        'id',
        'doi',
        'title',
        'publication_date',
        'type',
        'related_works',
        'corresponding_author_ids',
    ])
    .paginate(per_page=200)
)

#### Data Cleaning

In [None]:
# Clean, validate and export every work returned by the paginator. Records
# that cannot be cleaned or validated are reported and skipped. The output
# file is closed even if pagination fails part-way, so the handle is never
# leaked.
# NOTE(review): csv.writer stringifies non-string cells, so the two list
# columns end up in the file as Python list reprs — confirm that downstream
# readers expect that format.
try:
    for page in works_pager:
        for raw in page:
            try:
                raw['id'] = raw['id'].replace(oa_replace, '')
                # A list field that is present but null (None) is treated as
                # empty instead of raising TypeError and aborting the export.
                raw['related_works'] = [
                    rel.replace(oa_replace, '')
                    for rel in raw['related_works'] or []
                ]
                raw['corresponding_author_ids'] = [
                    rel.replace(oa_replace, '')
                    for rel in raw['corresponding_author_ids'] or []
                ]
                w = WorksModel(**raw)
            except (ValidationError, KeyError, AttributeError, TypeError) as e:
                # AttributeError/TypeError cover null or non-string IDs, which
                # are malformed records just like a validation failure.
                print("Skipping invalid record:", e)
            else:
                # Written outside the try body so that writer errors are never
                # mistaken for an invalid record.
                works_writer.writerow([
                    w.id,
                    w.doi,
                    w.title,
                    w.publication_date,
                    w.type,
                    w.related_works,
                    w.corresponding_author_ids,
                ])
finally:
    works_file.close()


In [None]:
# cleaned_ids = [work['id'].replace('https://openalex.org/', '') for work in chain(*works_pager)]
# cleaned_related_works = [[rel.replace('https://openalex.org/', '') for rel in work['related_works']] for work in chain(*works_pager)]


In [None]:
# for work in chain(*works_pager):
#     print(work['title'], work['publication_date'], work['type'])

In [None]:
# cleaned_related_works

#### Including Filter and Select

In [None]:
# Fetch a single work by its OpenAlex ID and list the keys of the returned
# record
Works()['W2889162861'].keys()

In [None]:
# Inspect the 'abstract_inverted_index' entry of the same work
Works()['W2889162861']['abstract_inverted_index']

In [None]:
# Print the first work returned by Works().get() that lists at least one
# authorship, then stop.
for work in Works().get():
    # Truthiness also skips a missing or null 'authorships' value, not only
    # an empty list (the old `!= []` test treated None as non-empty).
    if work.get('authorships'):
        print(work)
        break