In [7]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Obtaining data from Springernature Open Access API
### Manual Search via query

In [64]:
# We will first return urls of open access articles from the springer link group, filtered manually by search query

query = 'spinal decompression' # search query: field or subfield
user_key = os.getenv('SPRINGER_API_KEY')
index = 0 # first index to return
num_return = 10 # number of results to return per api call
url = f'https://api.springernature.com/openaccess/json?api_key={user_key}&q={query}&s={index}&p={num_return}'

open_source_article_urls = []
titles = []
for i in range(num_return):
    try: # check whether the type is article and whether it doesn't exceed 10 pages
        if (requests.get(url).json()['records'][i]['contentType'] == 'Article') \
        & (int(requests.get(url).json()['records'][i]['endingPage']) \
        - int(requests.get(url).json()['records'][i]['startingPage']) <= 10):
            open_source_article_urls.append(requests.get(url).json()['records'][i]['url'][0]['value'])
            titles.append(requests.get(url).json()['records'][i]['title'])
    except:
        continue
    
open_source_article_urls[:5], titles[:5]

(['http://dx.doi.org/10.1007/s00256-022-04051-3',
  'http://dx.doi.org/10.1007/s00264-022-05485-z',
  'http://dx.doi.org/10.1007/s40122-022-00395-9',
  'http://dx.doi.org/10.1007/s12325-022-02246-7',
  'http://dx.doi.org/10.1186/s12891-022-05768-x'],
 ['Predictive value of immediate pain relief after lumbar transforaminal epidural injection with local anesthetics and steroids for single level radiculopathy',
  'Comparison of hidden blood loss and clinical efficacy of percutaneous endoscopic transforaminal lumbar interbody fusion and minimally invasive transforaminal lumbar interbody fusion',
  'Bilateral Erector Spinae Plane Block for Quality of Recovery Following Posterior Lumbar Interbody Fusion: A Randomized Controlled Trial',
  'Brain Abnormalities in PIK3CA-Related Overgrowth Spectrum: Physician, Patient, and Caregiver Experiences',
  'Finite element analysis after rod fracture of the spinal hybrid elastic rod system'])

In [65]:
# Now for the obtained urls, we will scrape the pdf-urls of the articles

pdf_links = []
for link in open_source_article_urls:
    response = requests.get(link)
    soup = BeautifulSoup(response.content, "html.parser")
    pdf_links.append(str(soup.find_all('a', href=re.compile(r'(.pdf)'))).split('href="')[1].split('">\n')[0].strip('/'))
pdf_links[:5]

['https://link.springer.com/content/pdf/10.1007/s00256-022-04051-3.pdf',
 'https://link.springer.com/content/pdf/10.1007/s00264-022-05485-z.pdf',
 'https://link.springer.com/content/pdf/10.1007/s40122-022-00395-9.pdf',
 'https://link.springer.com/content/pdf/10.1007/s12325-022-02246-7.pdf',
 'bmcmusculoskeletdisord.biomedcentral.com/track/pdf/10.1186/s12891-022-05768-x.pdf']

In [66]:
# Saving the result in a pandas DataFrame

link_data = pd.DataFrame({'Title':titles,'PDF_link':pdf_links})
link_data.head()

Unnamed: 0,Title,PDF_link
0,Predictive value of immediate pain relief afte...,https://link.springer.com/content/pdf/10.1007/...
1,Comparison of hidden blood loss and clinical e...,https://link.springer.com/content/pdf/10.1007/...
2,Bilateral Erector Spinae Plane Block for Quali...,https://link.springer.com/content/pdf/10.1007/...
3,Brain Abnormalities in PIK3CA-Related Overgrow...,https://link.springer.com/content/pdf/10.1007/...
4,Finite element analysis after rod fracture of ...,bmcmusculoskeletdisord.biomedcentral.com/track...


#### But: apart from maybe the search queries, we're lacking the field of research (target).

# Obtaining data from OpenAlex

#### OpenALex is a knowledge graph that has already links and a labeling for the subdisciplines, see the [OpenAlex Concepts](https://docs.openalex.org/about-the-data/concept).

In [2]:
# Getting works from mixed disciplines and topics with an open access PDF link

page = 1
res_per_page = 100 # number of results per page, must be between 25 and 200
filter_condition_oa = 'has_oa_accepted_or_published_version:true'
url_openalex = f'https://api.openalex.org/works?filter={filter_condition_oa}&page={page}&per-page={res_per_page}'


#### This API returns 33407041 results!!!

In [3]:
# Searching on page 1

open_source_article_pdfs = []
concept_ids = []
concept_names = []
for i in range(res_per_page):
    try: # check whether the type is article and whether it has a open access url pointing to a pdf
        if (requests.get(url_openalex).json()['results'][i]['type'] == 'journal-article') \
        & (requests.get(url_openalex).json()['results'][i]['open_access']['oa_url'].endswith('.pdf')):
            for concept in requests.get(url_openalex).json()['results'][i]['concepts']:
                if concept['level'] == 0: # for the start we want to extract only the level 0 concepts
                    # when we go down to level 1, we also want to filter by score to get the best matching subdiscipline
                    concept_id = concept['id']
                    concept_name = concept['display_name']
                    concept_ids.append(concept_id)
                    concept_names.append(concept_name)
                    open_source_article_pdfs.append(requests.get(url_openalex).json()['results'][i]['open_access']['oa_url'])
    except:
        continue
    
link_data = pd.DataFrame({'PDF_link':open_source_article_pdfs, 'Concept': concept_names, 'ConceptID': concept_ids})
link_data

Unnamed: 0,PDF_link,Concept,ConceptID
0,http://journals.iucr.org/a/issues/2008/01/00/s...,Computer science,https://openalex.org/C41008148
1,https://academic.oup.com/nar/article-pdf/25/17...,Biology,https://openalex.org/C86803240
2,https://academic.oup.com/nar/article-pdf/25/17...,Computer science,https://openalex.org/C41008148
3,https://link.springer.com/content/pdf/10.1023%...,Mathematics,https://openalex.org/C33923547
4,http://www.pnas.org/content/76/9/4350.full.pdf,Chemistry,https://openalex.org/C185592680
...,...,...,...
59,http://genome.cshlp.org/content/20/9/1297.full...,Biology,https://openalex.org/C86803240
60,https://academic.oup.com/schizophreniabulletin...,Psychology,https://openalex.org/C15744967
61,https://serval.unil.ch/resource/serval:BIB_2B5...,Biology,https://openalex.org/C86803240
62,http://journals.iucr.org/j/issues/2007/04/00/h...,Computer science,https://openalex.org/C41008148


In [4]:
# Checking whether academic fields are evenly distributed within page 1 of the data (the are not)

link_data.groupby(by='Concept').count()

Unnamed: 0_level_0,PDF_link,ConceptID
Concept,Unnamed: 1_level_1,Unnamed: 2_level_1
Biology,20,20
Chemistry,3,3
Computer science,29,29
Materials science,1,1
Mathematics,6,6
Medicine,2,2
Psychology,3,3


In [8]:
# Filling up the DataFrame with more results

res_per_page = 100
filter_condition_oa = 'has_oa_accepted_or_published_version:true'
for page in range(100):
    url_openalex = f'https://api.openalex.org/works?filter={filter_condition_oa}&page={page}&per-page={res_per_page}'
    
    open_source_article_pdfs = []
    concept_names = []

    for i in range(res_per_page):
        try: # check whether the type is article and whether it has a open access url pointing to a pdf
            if (requests.get(url_openalex).json()['results'][i]['type'] == 'journal-article') \
            & (requests.get(url_openalex).json()['results'][i]['open_access']['oa_url'].endswith('.pdf')):
                for concept in requests.get(url_openalex).json()['results'][i]['concepts']:
                    if concept['level'] == 0: # for the start we want to extract only the level 0 concepts
                        # when we go down to level 1, we also want to filter by score to get the best matching subdiscipline
                        concept_names.append(concept['display_name'])
                        open_source_article_pdfs.append(requests.get(url_openalex).json()['results'][i]['open_access']['oa_url'])
        except:
            continue
    
    if page%10 == 0: # every 1000 entries create a new csv
        link_data = pd.DataFrame({'PDF_link':open_source_article_pdfs, 'Concept': concept_names})
        link_data.to_csv(f'raw_data/{page}p_link_data.csv', index=False)
        