In [1]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Obtaining data from Springernature Open Access API
### Manual Search via query

In [64]:
# We will first return urls of open access articles from the springer link group, filtered manually by search query

query = 'spinal decompression' # search query: field or subfield
user_key = os.getenv('SPRINGER_API_KEY')
index = 0 # first index to return
num_return = 10 # number of results to return per api call
url = f'https://api.springernature.com/openaccess/json?api_key={user_key}&q={query}&s={index}&p={num_return}'

open_source_article_urls = []
titles = []
for i in range(num_return):
    try: # check whether the type is article and whether it doesn't exceed 10 pages
        if (requests.get(url).json()['records'][i]['contentType'] == 'Article') \
        & (int(requests.get(url).json()['records'][i]['endingPage']) \
        - int(requests.get(url).json()['records'][i]['startingPage']) <= 10):
            open_source_article_urls.append(requests.get(url).json()['records'][i]['url'][0]['value'])
            titles.append(requests.get(url).json()['records'][i]['title'])
    except:
        continue
    
open_source_article_urls[:5], titles[:5]

(['http://dx.doi.org/10.1007/s00256-022-04051-3',
  'http://dx.doi.org/10.1007/s00264-022-05485-z',
  'http://dx.doi.org/10.1007/s40122-022-00395-9',
  'http://dx.doi.org/10.1007/s12325-022-02246-7',
  'http://dx.doi.org/10.1186/s12891-022-05768-x'],
 ['Predictive value of immediate pain relief after lumbar transforaminal epidural injection with local anesthetics and steroids for single level radiculopathy',
  'Comparison of hidden blood loss and clinical efficacy of percutaneous endoscopic transforaminal lumbar interbody fusion and minimally invasive transforaminal lumbar interbody fusion',
  'Bilateral Erector Spinae Plane Block for Quality of Recovery Following Posterior Lumbar Interbody Fusion: A Randomized Controlled Trial',
  'Brain Abnormalities in PIK3CA-Related Overgrowth Spectrum: Physician, Patient, and Caregiver Experiences',
  'Finite element analysis after rod fracture of the spinal hybrid elastic rod system'])

In [65]:
# Now for the obtained urls, we will scrape the pdf-urls of the articles

pdf_links = []
for link in open_source_article_urls:
    response = requests.get(link)
    soup = BeautifulSoup(response.content, "html.parser")
    pdf_links.append(str(soup.find_all('a', href=re.compile(r'(.pdf)'))).split('href="')[1].split('">\n')[0].strip('/'))
pdf_links[:5]

['https://link.springer.com/content/pdf/10.1007/s00256-022-04051-3.pdf',
 'https://link.springer.com/content/pdf/10.1007/s00264-022-05485-z.pdf',
 'https://link.springer.com/content/pdf/10.1007/s40122-022-00395-9.pdf',
 'https://link.springer.com/content/pdf/10.1007/s12325-022-02246-7.pdf',
 'bmcmusculoskeletdisord.biomedcentral.com/track/pdf/10.1186/s12891-022-05768-x.pdf']

In [66]:
# Saving the result in a pandas DataFrame

link_data = pd.DataFrame({'Title':titles,'PDF_link':pdf_links})
link_data.head()

Unnamed: 0,Title,PDF_link
0,Predictive value of immediate pain relief afte...,https://link.springer.com/content/pdf/10.1007/...
1,Comparison of hidden blood loss and clinical e...,https://link.springer.com/content/pdf/10.1007/...
2,Bilateral Erector Spinae Plane Block for Quali...,https://link.springer.com/content/pdf/10.1007/...
3,Brain Abnormalities in PIK3CA-Related Overgrow...,https://link.springer.com/content/pdf/10.1007/...
4,Finite element analysis after rod fracture of ...,bmcmusculoskeletdisord.biomedcentral.com/track...


#### But: apart from maybe the search queries, we're lacking the field of research (target).

# Obtaining data from OpenAlex

#### OpenAlex is a knowledge graph that already comes with linkage and a labeling for the subdisciplines, see the [OpenAlex Concepts](https://docs.openalex.org/about-the-data/concept). This API returns 33407041 results!!!

In [43]:
# Getting links to open access PDFs from mixed disciplines and subtopics from page 1

page = 1
res_per_page = 25 # number of results per page, must be between 25 and 200
filter_condition_oa = 'has_oa_accepted_or_published_version:true'
url_openalex = f'https://api.openalex.org/works?filter={filter_condition_oa}&page={page}&per-page={res_per_page}'

open_source_article_pdfs = []
publication_years = []
concept_names = []
subconcept_names = []
for i in range(res_per_page): # check whether the type is 'journal article' and whether it has a open access url pointing to a pdf and if it has topis or subtopics
    if (requests.get(url_openalex).json()['results'][i]['type'] == 'journal-article') \
    & (requests.get(url_openalex).json()['results'][i]['open_access']['oa_url'].endswith('.pdf')):
        try:
            for concept in requests.get(url_openalex).json()['results'][i]['concepts']:
                if concept['level'] == 0:
                    topic = concept['display_name']
                elif concept['level'] == 1:
                    subtopic = concept['display_name']
            if concept and subtopic: # only append information to lists if concepts and subconcepts have been found
                publication_years.append(requests.get(url_openalex).json()['results'][i]['publication_year'])
                open_source_article_pdfs.append(requests.get(url_openalex).json()['results'][i]['open_access']['oa_url'])
                subconcept_names.append(subtopic)
                concept_names.append(topic)
        except:
            continue
    
link_data = pd.DataFrame({'pdf_link':open_source_article_pdfs, 'year': publication_years, 'concepts': concept_names, 'subconcepts': subconcept_names})
link_data.to_csv(f'../raw_data/page{page}_link_data.csv', index=False)

In [37]:
link_data # --> out of 25 results from the first page of the json, 11 fulfilled our conditions

Unnamed: 0,pdf_link,year,concepts,subconcepts
0,http://journals.iucr.org/a/issues/2008/01/00/s...,2008,Computer science,Software engineering
1,https://academic.oup.com/nar/article-pdf/25/17...,1997,Computer science,Computational biology
2,https://link.springer.com/content/pdf/10.1023%...,2001,Mathematics,Artificial intelligence
3,http://www.pnas.org/content/76/9/4350.full.pdf,1979,Chemistry,Biochemistry
4,https://repositorio.unal.edu.co/bitstream/unal...,1974,Computer science,Algorithm
5,https://academic.oup.com/bioinformatics/articl...,2009,Computer science,Computational biology
6,https://academic.oup.com/nar/article-pdf/25/24...,1997,Biology,Computational biology
7,https://academic.oup.com/mbe/article-pdf/28/10...,2011,Biology,Computational biology
8,https://www.jstatsoft.org/index.php/jss/articl...,2015,Computer science,Mathematical optimization
9,https://academic.oup.com/bioinformatics/articl...,2009,Computer science,Algorithm


In [2]:
# Filling a csv with more results (ran in Colab with Google Drive)

res_per_page = 100
filter_condition_oa = 'has_oa_accepted_or_published_version:true'
for page in range(100):
    url_openalex = f'https://api.openalex.org/works?filter={filter_condition_oa}&page={page}&per-page={res_per_page}'

    open_source_article_pdfs = []
    publication_years = []
    concept_names = []
    subconcept_names = []
    for i in range(res_per_page): # check whether the type is 'journal article' and whether it has a open access url pointing to a pdf and if it has topis or subtopics
        try:
            if (requests.get(url_openalex).json()['results'][i]['type'] == 'journal-article') \
            & (requests.get(url_openalex).json()['results'][i]['open_access']['oa_url'].endswith('.pdf')):
                for concept in requests.get(url_openalex).json()['results'][i]['concepts']:
                    if concept['level'] == 0:
                        topic = concept['display_name']
                    elif concept['level'] == 1:
                        subtopic = concept['display_name']
                if concept and subtopic: # only append information to lists if concepts and subconcepts have been found
                    publication_years.append(requests.get(url_openalex).json()['results'][i]['publication_year'])
                    open_source_article_pdfs.append(requests.get(url_openalex).json()['results'][i]['open_access']['oa_url'])
                    subconcept_names.append(subtopic)
                    concept_names.append(topic)
        except:
            continue
    
    # creating a new csv for every page in the json (100 entries), otherwiese could use "if page%10 == 0:""
    link_data = pd.DataFrame({'pdf_link':open_source_article_pdfs, 'year': publication_years, 'concepts': concept_names, 'subconcepts': subconcept_names})
    link_data.to_csv(f'../raw_data/page{page}_link_data.csv', index=False)