In [None]:
# PyAlex and Data Creation imports
from pyalex import Works, Authors, Sources, Institutions, Topics, Publishers, Funders
import pyalex, os, csv
# Contact e-mail handed to pyalex for its OpenAlex requests.
# Set the OPENALEX_EMAIL environment variable to override it without editing the notebook;
# the previous hard-coded address stays as the fallback so existing runs behave the same.
pyalex.config.email = os.environ.get("OPENALEX_EMAIL", "b.cliff@gwmail.gwu.edu")
from itertools import chain

In [2]:
# Data Validation imports
from pydantic import BaseModel, Field, ValidationError
from typing import Optional, List

#### Configuring Output Files

In [3]:
# Folder that receives every CSV produced by this notebook.
data_folder_loc = './data'
# open(path, 'w') does not create missing parent folders, so make sure the
# folder exists before any output file is opened (no-op when it already does).
os.makedirs(data_folder_loc, exist_ok=True)

In [4]:
def _data_path(file_name):
    """Return the location of `file_name` inside `data_folder_loc`."""
    return f'{data_folder_loc}/{file_name}'


# Node tables
WORKS_FILE = _data_path('works.csv')
AUTHORS_FILE = _data_path('authors.csv')
TOPICS_FILE = _data_path('topics.csv')
INSTITUTIONS_FILE = _data_path('institutions.csv')
SOURCES_FILE = _data_path('sources.csv')

# Edge lists
WORK_AUTH_FILE = _data_path('work_auth_edges.csv')
WORK_TOPIC_FILE = _data_path('work_topic_edges.csv')

In [5]:
# Delete leftovers from earlier runs so every output file starts from scratch.
stale_outputs = (
    WORKS_FILE, AUTHORS_FILE, TOPICS_FILE, WORK_AUTH_FILE,
    WORK_TOPIC_FILE, INSTITUTIONS_FILE, SOURCES_FILE,
)
# filter() is lazy, so each existence check still happens right before its removal.
for stale_path in filter(os.path.exists, stale_outputs):
    os.remove(stale_path)

In [None]:
# Open the works/authors/topics tables and the work -> author edge list for writing.
# newline='' is what the csv module expects for files handed to csv.writer.
# Each handle is closed by the cell that finishes writing to it.
csv_open_options = {'mode': 'w', 'newline': '', 'encoding': 'utf-8'}

works_file = open(WORKS_FILE, **csv_open_options)
authors_file = open(AUTHORS_FILE, **csv_open_options)
topics_file = open(TOPICS_FILE, **csv_open_options)
wa_edge_file = open(WORK_AUTH_FILE, **csv_open_options)

In [None]:
# Output handle for the work -> topic edge list; closed by the cell that writes those edges.
wt_edge_file = open(WORK_TOPIC_FILE, mode='w', newline='', encoding='utf-8')

In [None]:
# Wrap each open output file in a csv.writer (the header rows are written in a later cell).
works_writer, authors_writer, topics_writer, wa_edges_writer = map(
    csv.writer,
    (works_file, authors_file, topics_file, wa_edge_file),
)

In [None]:
# CSV writer bound to the work -> topic edge file.
wt_edges_writer = csv.writer(wt_edge_file)

In [None]:
# Header rows for the works table, the authors table and the work -> author edge list.
works_columns = [
    'id', 'doi', 'title', 'publication_date',
    'type', 'related_works', 'corresponding_author_ids',
]
authors_columns = [
    'id', 'name', 'works_count', 'cited_by_count',
    'affiliations', 'topics', 'counts_by_year',
]
work_author_edge_columns = ['work_id', 'author_id', 'author_position', 'is_corresponding']

works_writer.writerow(works_columns)
authors_writer.writerow(authors_columns)
wa_edges_writer.writerow(work_author_edge_columns)

In [None]:
# Header row for the work -> topic edge list.
work_topic_edge_columns = ['work_id', 'topic_id', 'topic_name', 'score']
wt_edges_writer.writerow(work_topic_edge_columns)

In [None]:
# Header row for the topics table.
topics_columns = [
    'id', 'topic_name', 'description', 'keywords',
    'subfield', 'field', 'domain',
    'works_count', 'cited_by_count',
    'updated_date', 'created_date',
]
topics_writer.writerow(topics_columns)

#### Creating Pydantic Models for Validation

In [None]:
class WorksModel(BaseModel):
    """Validated subset of an OpenAlex work record.

    ``id``, ``title`` and ``publication_date`` are required strings; a record
    lacking any of them (or carrying a null there) raises ``ValidationError``.
    Every other field falls back to ``None`` when absent.
    """
    id: str
    doi: Optional[str] = None
    title: str
    publication_date: str
    type: Optional[str] = None
    # Optional[...] so the annotation agrees with the None default and an
    # explicit null validates instead of being rejected.
    related_works: Optional[List[str]] = None
    corresponding_author_ids: Optional[List[str]] = None

In [None]:
class AuthorsModel(BaseModel):
    """Validated subset of an OpenAlex author record.

    ``id``, ``display_name``, ``works_count``, ``cited_by_count`` and
    ``counts_by_year`` are required; ``affiliations`` and ``topics`` fall back
    to ``None`` when absent.
    """
    id: str
    display_name: str
    works_count: int
    cited_by_count: int
    # Optional[...] so the annotation agrees with the None default and an
    # explicit null validates instead of being rejected.
    affiliations: Optional[List[dict]] = None
    topics: Optional[List[dict]] = None
    counts_by_year: List[dict]

In [None]:
class TopicsModel(BaseModel):
    """Validated subset of an OpenAlex topic record.

    The three ``*_name`` fields are plain strings that the caller must supply
    under exactly these names; they default to ``None`` when not passed.
    """
    id: str
    display_name: str
    description: str
    # Optional[...] so the annotation agrees with the None default and an
    # explicit null validates instead of being rejected.
    keywords: Optional[List[str]] = None
    subfield_name: Optional[str] = None
    field_name: Optional[str] = None
    domain_name: Optional[str] = None
    works_count: int
    cited_by_count: int
    updated_date: str
    created_date: str

### Data Retrieval from OpenAlex

In [3]:
# URL prefix removed from OpenAlex identifiers so only the short id remains.
oa_replace = "https://openalex.org/"

In [4]:
# Pager over all works (no filter applied), 200 records per page,
# restricted to the columns that end up in the works table.
works_fields = [
    'id', 'doi', 'title', 'publication_date',
    'type', 'related_works', 'corresponding_author_ids',
]
works_pager = Works().select(works_fields).paginate(per_page=200)

In [5]:
# Pager over all authors (no filter applied), 200 records per page.
# 'orcid' is requested here although the authors cell below does not write it out.
authors_fields = [
    'id', 'orcid', 'display_name',
    'works_count', 'cited_by_count',
    'affiliations', 'topics', 'counts_by_year',
]
authors_pager = Authors().select(authors_fields).paginate(per_page=200)

In [6]:
# Pager over all topics (no filter applied), 200 records per page.
# not including siblings for now - complicated
topics_fields = [
    'id', 'display_name', 'description', 'keywords',
    'subfield', 'field', 'domain',
    'works_count', 'cited_by_count',
    'updated_date', 'created_date',
]
topics_pager = Topics().select(topics_fields).paginate(per_page=200)

In [11]:
# Pull one random author to inspect the shape of a raw record.
random_author = Authors().random()
random_author

{'id': 'https://openalex.org/A5057583326',
 'orcid': None,
 'display_name': 'Hang Yu Wang',
 'display_name_alternatives': ['Hang Yu Wang'],
 'works_count': 9,
 'cited_by_count': 12,
 'summary_stats': {'2yr_mean_citedness': 0.0, 'h_index': 2, 'i10_index': 0},
 'ids': {'openalex': 'https://openalex.org/A5057583326', 'orcid': None},
 'affiliations': [{'institution': {'id': 'https://openalex.org/I4210133467',
    'ror': 'https://ror.org/03qn8zy63',
    'display_name': "Xi'an Aeronautical University",
    'country_code': 'CN',
    'type': 'education',
    'lineage': ['https://openalex.org/I4210133467']},
   'years': [2012]},
  {'institution': {'id': 'https://openalex.org/I2800710378',
    'ror': 'https://ror.org/056vyez31',
    'display_name': 'Naval University of Engineering',
    'country_code': 'CN',
    'type': 'education',
    'lineage': ['https://openalex.org/I2800710378']},
   'years': [2014, 2013, 2012, 2011]}],
 'last_known_institutions': [{'id': 'https://openalex.org/I2800710378',

#### Data Cleaning

In [None]:
# Walk every page of works, strip the OpenAlex URL prefix from the ids,
# validate the record and append one CSV row per valid work.
try:
    for page in works_pager:
        for raw in page:
            try:
                raw['id'] = raw['id'].replace(oa_replace, '')
                # A null list field is treated as "no entries"; iterating None would
                # raise a TypeError that the except clause below does not catch.
                raw['related_works'] = [
                    rel.replace(oa_replace, '') for rel in (raw['related_works'] or [])
                ]
                raw['corresponding_author_ids'] = [
                    rel.replace(oa_replace, '') for rel in (raw['corresponding_author_ids'] or [])
                ]
                w = WorksModel(**raw)
                works_writer.writerow(
                    [w.id,
                     w.doi,
                     w.title,
                     w.publication_date,
                     w.type,
                     w.related_works,
                     w.corresponding_author_ids]
                )
            except (ValidationError, KeyError) as e:
                # Best-effort export: report the bad record and move on.
                print("Skipping invalid record:", e)
finally:
    # Close (and flush) the file even when pagination or writing fails part-way.
    works_file.close()


In [None]:
# Walk every page of authors, strip the OpenAlex URL prefix from the id,
# validate the record and append one CSV row per valid author.
try:
    for page in authors_pager:
        for raw in page:
            try:
                raw['id'] = raw['id'].replace(oa_replace, '')
                a = AuthorsModel(**raw)
                authors_writer.writerow([
                    a.id,
                    a.display_name,
                    a.works_count,
                    a.cited_by_count,
                    a.affiliations,
                    a.topics,
                    a.counts_by_year
                ])
            except (ValidationError, KeyError) as e:
                # Best-effort export: report the bad record and move on.
                print("Skipping invalid record:", e)
finally:
    # Close (and flush) the file even when pagination or writing fails part-way.
    authors_file.close()

In [None]:
# Walk every page of topics, flatten the subfield/field/domain objects to their
# display names, validate the record and append one CSV row per valid topic.
# The row layout follows the topics header: id, topic_name, description, keywords,
# subfield, field, domain, works_count, cited_by_count, updated_date, created_date.
try:
    for page in topics_pager:
        for raw in page:
            try:
                fields = dict(raw)
                fields['id'] = raw['id'].replace(oa_replace, '')
                # TopicsModel declares subfield_name / field_name / domain_name, so the
                # display names must be passed under those keys; a null or missing
                # hierarchy level yields an empty string instead of crashing.
                for level in ('subfield', 'field', 'domain'):
                    fields[f'{level}_name'] = (raw.get(level) or {}).get('display_name', '')
                t = TopicsModel(**fields)
                topics_writer.writerow([
                    t.id,
                    t.display_name,
                    t.description,
                    t.keywords,  # was missing, which shifted every later column
                    t.subfield_name,
                    t.field_name,
                    t.domain_name,
                    t.works_count,
                    t.cited_by_count,
                    t.updated_date,
                    t.created_date,
                ])
            except (ValidationError, KeyError) as e:
                # Same best-effort policy as the works and authors cells.
                print("Skipping invalid record:", e)
finally:
    # Close (and flush) the file even when pagination or writing fails part-way.
    topics_file.close()

#### Including Filter and Select

In [None]:
# List the top-level keys of one known work to see which fields a Works() record carries.
sample_work = Works()['W2889162861']
sample_work.keys()

In [None]:
# Look at the raw 'abstract_inverted_index' value of the same known work.
sample_work = Works()['W2889162861']
sample_work['abstract_inverted_index']

In [None]:
# Print the first work on the default result page that has a non-empty 'authorships' list
# (prints nothing when no such work is on the page).
first_with_authors = next(
    (candidate for candidate in Works().get() if candidate['authorships'] != []),
    None,
)
if first_with_authors is not None:
    print(first_with_authors)

#### Relationship Creation Attempt

In [None]:
class WorkAuthoredByEdge(BaseModel):
    """One work -> author edge; all four fields are required."""
    work_id: str
    author_id: str
    # Position label taken from the authorship entry; must be a string.
    author_position: str
    # Must be a real boolean: a null here fails validation.
    is_corresponding: bool

In [None]:
class WorkHasTopic(BaseModel):
    """One work -> topic edge; ``score`` is optional and defaults to ``None``."""
    # NOTE(review): the work -> topic cell visible in this notebook writes rows
    # directly and does not instantiate this model - confirm whether that is intended.
    work_id: str
    topic_id: str
    score: Optional[float] = None

class WorkCitesWork(BaseModel):
    """One citation edge between two works; both ids are required."""
    # Id of the work that contains the reference.
    citing_work_id: str
    # Id of the work being referenced.
    cited_work_id: str

class AuthorHasTopic(BaseModel):
    """One author -> topic edge; ``score`` is optional and defaults to ``None``."""
    author_id: str
    topic_id: str
    score: Optional[float] = None

In [None]:
# Pager for the work -> author edges: only the id and authorships are requested.
# NOTE: this rebinds `works_pager`, replacing the pager that fed the works table.
author_edge_fields = ['id', 'authorships']
works_pager = Works().select(author_edge_fields).paginate(per_page=200)

In [None]:
# Pager for the work -> topic edges: only the id and topics are requested.
topic_edge_fields = ['id', 'topics']
works_pager_topics = Works().select(topic_edge_fields).paginate(per_page=200)

In [None]:
# Pager for citation edges: only the id and referenced_works are requested.
# NOTE(review): no visible cell consumes this pager yet.
citation_edge_fields = ['id', 'referenced_works']
works_pager_citations = Works().select(citation_edge_fields).paginate(per_page=200)

In [None]:
# Walk every page of works and write one row per (work, author) pair to the
# work -> author edge list. Works without an id and authorships without an
# author id are skipped; edges that fail validation are counted and reported.
skipped_edges = 0
try:
    for page in works_pager:
        for raw in page:
            work_id = raw.get('id')
            if not work_id:
                continue

            work_id = work_id.replace(oa_replace, '')

            for auth in raw.get('authorships') or []:
                author_block = auth.get("author") or {}
                author_id = author_block.get("id")

                if not author_id:
                    continue

                try:
                    edge = WorkAuthoredByEdge(
                        work_id=work_id,
                        author_id=author_id.replace(oa_replace, ""),
                        author_position=auth.get("author_position"),
                        is_corresponding=auth.get("is_corresponding")
                    )
                except ValidationError:
                    # Keep the export best-effort, but do not lose track of
                    # how many edges were dropped.
                    skipped_edges += 1
                    continue

                wa_edges_writer.writerow([
                    edge.work_id,
                    edge.author_id,
                    edge.author_position,
                    edge.is_corresponding
                ])
finally:
    # Close (and flush) the file even when pagination or writing fails part-way.
    wa_edge_file.close()
    if skipped_edges:
        print(f"Skipped {skipped_edges} invalid work-author edges")

In [None]:
# Walk every page of works and write one row per (work, topic) pair to the
# work -> topic edge list: work_id, topic_id, topic_name, score.
try:
    for page in works_pager_topics:
        for raw in page:
            work_id = raw.get('id')
            if not work_id:
                continue

            work_id = work_id.replace(oa_replace, '')

            # .get(...) or [] tolerates a missing or null 'topics' field,
            # mirroring how authorships are read in the work -> author cell.
            for topic in raw.get('topics') or []:
                topic_id = topic.get('id')
                if not topic_id:
                    # An edge without a target id is unusable; skip it rather than crash.
                    continue

                wt_edges_writer.writerow([
                    work_id,
                    topic_id.replace(oa_replace, ''),
                    topic.get('display_name'),
                    topic.get('score')
                ])
finally:
    # Close (and flush) the file even when pagination or writing fails part-way.
    wt_edge_file.close()