In [3]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import glob

# First attempt: Obtaining data from Springernature Open Access API
### Manual Search via query

In [64]:
# We will first return urls of open access articles from the springer link group, filtered manually by search query

query = 'spinal decompression' # search query: field or subfield
user_key = os.getenv('SPRINGER_API_KEY')
index = 0 # first index to return
num_return = 10 # number of results to return per api call
url = f'https://api.springernature.com/openaccess/json?api_key={user_key}&q={query}&s={index}&p={num_return}'

open_source_article_urls = []
titles = []
for i in range(num_return):
    try: # check whether the type is article and whether it doesn't exceed 10 pages
        if (requests.get(url).json()['records'][i]['contentType'] == 'Article') \
        & (int(requests.get(url).json()['records'][i]['endingPage']) \
        - int(requests.get(url).json()['records'][i]['startingPage']) <= 10):
            open_source_article_urls.append(requests.get(url).json()['records'][i]['url'][0]['value'])
            titles.append(requests.get(url).json()['records'][i]['title'])
    except:
        continue
    
open_source_article_urls[:5], titles[:5]

(['http://dx.doi.org/10.1007/s00256-022-04051-3',
  'http://dx.doi.org/10.1007/s00264-022-05485-z',
  'http://dx.doi.org/10.1007/s40122-022-00395-9',
  'http://dx.doi.org/10.1007/s12325-022-02246-7',
  'http://dx.doi.org/10.1186/s12891-022-05768-x'],
 ['Predictive value of immediate pain relief after lumbar transforaminal epidural injection with local anesthetics and steroids for single level radiculopathy',
  'Comparison of hidden blood loss and clinical efficacy of percutaneous endoscopic transforaminal lumbar interbody fusion and minimally invasive transforaminal lumbar interbody fusion',
  'Bilateral Erector Spinae Plane Block for Quality of Recovery Following Posterior Lumbar Interbody Fusion: A Randomized Controlled Trial',
  'Brain Abnormalities in PIK3CA-Related Overgrowth Spectrum: Physician, Patient, and Caregiver Experiences',
  'Finite element analysis after rod fracture of the spinal hybrid elastic rod system'])

In [65]:
# Now for the obtained urls, we will scrape the pdf-urls of the articles

pdf_links = []
for link in open_source_article_urls:
    response = requests.get(link)
    soup = BeautifulSoup(response.content, "html.parser")
    pdf_links.append(str(soup.find_all('a', href=re.compile(r'(.pdf)'))).split('href="')[1].split('">\n')[0].strip('/'))
pdf_links[:5]

['https://link.springer.com/content/pdf/10.1007/s00256-022-04051-3.pdf',
 'https://link.springer.com/content/pdf/10.1007/s00264-022-05485-z.pdf',
 'https://link.springer.com/content/pdf/10.1007/s40122-022-00395-9.pdf',
 'https://link.springer.com/content/pdf/10.1007/s12325-022-02246-7.pdf',
 'bmcmusculoskeletdisord.biomedcentral.com/track/pdf/10.1186/s12891-022-05768-x.pdf']

In [66]:
# Saving the result in a pandas DataFrame

link_data = pd.DataFrame({'Title':titles,'PDF_link':pdf_links})
link_data.head()

Unnamed: 0,Title,PDF_link
0,Predictive value of immediate pain relief afte...,https://link.springer.com/content/pdf/10.1007/...
1,Comparison of hidden blood loss and clinical e...,https://link.springer.com/content/pdf/10.1007/...
2,Bilateral Erector Spinae Plane Block for Quali...,https://link.springer.com/content/pdf/10.1007/...
3,Brain Abnormalities in PIK3CA-Related Overgrow...,https://link.springer.com/content/pdf/10.1007/...
4,Finite element analysis after rod fracture of ...,bmcmusculoskeletdisord.biomedcentral.com/track...


#### But: apart from maybe the search queries, we're lacking the field of research (target).

# More promising attempt: Obtaining data from OpenAlex

#### OpenAlex is a knowledge graph that already comes with linkage and a labeling for the subdisciplines, see the [OpenAlex Concepts](https://docs.openalex.org/about-the-data/concept). This API returns 33407041 results!!!

In [43]:
# Getting links to open access PDFs from mixed disciplines and subtopics from page 1

page = 1
res_per_page = 25 # number of results per page, must be between 25 and 200
filter_condition_oa = 'has_oa_accepted_or_published_version:true'
url_openalex = f'https://api.openalex.org/works?filter={filter_condition_oa}&page={page}&per-page={res_per_page}'

open_source_article_pdfs = []
publication_years = []
concept_names = []
subconcept_names = []
for i in range(res_per_page): # check whether the type is 'journal article' and whether it has a open access url pointing to a pdf and if it has topis or subtopics
    if (requests.get(url_openalex).json()['results'][i]['type'] == 'journal-article') \
    & (requests.get(url_openalex).json()['results'][i]['open_access']['oa_url'].endswith('.pdf')):
        try:
            for concept in requests.get(url_openalex).json()['results'][i]['concepts']:
                if concept['level'] == 0:
                    topic = concept['display_name']
                elif concept['level'] == 1:
                    subtopic = concept['display_name']
            if concept and subtopic: # only append information to lists if concepts and subconcepts have been found
                publication_years.append(requests.get(url_openalex).json()['results'][i]['publication_year'])
                open_source_article_pdfs.append(requests.get(url_openalex).json()['results'][i]['open_access']['oa_url'])
                subconcept_names.append(subtopic)
                concept_names.append(topic)
        except:
            continue
    
link_data = pd.DataFrame({'pdf_link':open_source_article_pdfs, 'year': publication_years, 'concepts': concept_names, 'subconcepts': subconcept_names})
link_data.to_csv(f'../raw_data/page{page}_link_data.csv', index=False)

In [37]:
link_data # --> out of 25 results from the first page of the json, 11 fulfilled our conditions

Unnamed: 0,pdf_link,year,concepts,subconcepts
0,http://journals.iucr.org/a/issues/2008/01/00/s...,2008,Computer science,Software engineering
1,https://academic.oup.com/nar/article-pdf/25/17...,1997,Computer science,Computational biology
2,https://link.springer.com/content/pdf/10.1023%...,2001,Mathematics,Artificial intelligence
3,http://www.pnas.org/content/76/9/4350.full.pdf,1979,Chemistry,Biochemistry
4,https://repositorio.unal.edu.co/bitstream/unal...,1974,Computer science,Algorithm
5,https://academic.oup.com/bioinformatics/articl...,2009,Computer science,Computational biology
6,https://academic.oup.com/nar/article-pdf/25/24...,1997,Biology,Computational biology
7,https://academic.oup.com/mbe/article-pdf/28/10...,2011,Biology,Computational biology
8,https://www.jstatsoft.org/index.php/jss/articl...,2015,Computer science,Mathematical optimization
9,https://academic.oup.com/bioinformatics/articl...,2009,Computer science,Algorithm


In [2]:
# Filling a csv with more results (ran in Colab with Google Drive)

res_per_page = 100
filter_condition_oa = 'has_oa_accepted_or_published_version:true'
for page in range(100):
    url_openalex = f'https://api.openalex.org/works?filter={filter_condition_oa}&page={page}&per-page={res_per_page}'

    open_source_article_pdfs = []
    publication_years = []
    concept_names = []
    subconcept_names = []
    for i in range(res_per_page): # check whether the type is 'journal article' and whether it has a open access url pointing to a pdf and if it has topis or subtopics
        try:
            if (requests.get(url_openalex).json()['results'][i]['type'] == 'journal-article') \
            & (requests.get(url_openalex).json()['results'][i]['open_access']['oa_url'].endswith('.pdf')):
                for concept in requests.get(url_openalex).json()['results'][i]['concepts']:
                    if concept['level'] == 0:
                        topic = concept['display_name']
                    elif concept['level'] == 1:
                        subtopic = concept['display_name']
                if concept and subtopic: # only append information to lists if concepts and subconcepts have been found
                    publication_years.append(requests.get(url_openalex).json()['results'][i]['publication_year'])
                    open_source_article_pdfs.append(requests.get(url_openalex).json()['results'][i]['open_access']['oa_url'])
                    subconcept_names.append(subtopic)
                    concept_names.append(topic)
        except:
            continue
    
    # creating a new csv for every page in the json (100 entries), otherwiese could use "if page%10 == 0:""
    link_data = pd.DataFrame({'pdf_link':open_source_article_pdfs, 'year': publication_years, 'concepts': concept_names, 'subconcepts': subconcept_names})
    link_data.to_csv(f'../raw_data/link_data/page{page}_link_data.csv', index=False)

### Implementing Cursor Pagination into OpenAlex API
#### This is necessary because otherwise we're only allowed access to the first 10.000 entries.

In [None]:
# Filling a csv with results after page 100 with cursor pagination (ran in Colab with Google Drive)

filter_condition_oa = 'has_oa_accepted_or_published_version:true'
res_per_page = 100
cursor = '*'
url_openalex = f'https://api.openalex.org/works?filter={filter_condition_oa}&per-page={res_per_page}&cursor={cursor}'

#for i in range(1, 101): # making the cursor start at a certain page (if we've already requested the previous pages)
#    cursor = requests.get(url_openalex).json()['meta']['next_cursor']
#    url_openalex = f'https://api.openalex.org/works?filter={filter_condition_oa}&per-page={res_per_page}&cursor={cursor}'
#    print(f'cursor num {i}: {cursor}')
    
for page in range(1, 201):
    ids = []
    titles = []
    open_source_article_pdfs = []
    publication_years = []
    concept_names = []
    subconcept_names = []
    for i in range(res_per_page): # check whether the type is 'journal article' and 
        #whether it has a open access url pointing to a pdf and if it has topics or subtopics
        try:
            if (requests.get(url_openalex).json()['results'][i]['type'] == 'journal-article') \
            & (requests.get(url_openalex).json()['results'][i]['open_access']['oa_url'].endswith('.pdf')):
                for concept in requests.get(url_openalex).json()['results'][i]['concepts']:
                    if concept['level'] == 0:
                        topic = concept['display_name']
                    elif concept['level'] == 1:
                        subtopic = concept['display_name']
                if concept and subtopic: # only append information to lists if concepts and subconcepts have been found
                    ids.append(requests.get(url_openalex).json()['results'][0]['id'])
                    titles.append(requests.get(url_openalex).json()['results'][0]['title'])
                    publication_years.append(requests.get(url_openalex).json()['results'][i]['publication_year'])
                    open_source_article_pdfs.append(requests.get(url_openalex).json()['results'][i]['open_access']['oa_url'])
                    subconcept_names.append(subtopic)
                    concept_names.append(topic)
        except:
            continue
        
    cursor = requests.get(url_openalex).json()['meta']['next_cursor']
    url_openalex = f'https://api.openalex.org/works?filter={filter_condition_oa}&per-page={res_per_page}&cursor={cursor}'
    print(f'cursor num {page}: {cursor}')
    
    # creating a new csv for every page in the json, otherwiese for bulking could use "if page%10 == 0:""
    link_data = pd.DataFrame({'id': ids, 'pdf_link':open_source_article_pdfs, 'title':titles, 'year': publication_years, 'concepts': concept_names, 'subconcepts': subconcept_names})
    link_data.to_csv(f'../raw_data/link_data_id/page{page}_link_data.csv', index=False)

# Concatenating Data into one DataFrame

In [3]:
# Make list of DataFrames and then concatenate them
joined_files = os.path.join('/home/stefanie/code/WorkingPaper/raw_data/link_data', '*.csv')

joined_list = glob.glob(joined_files)

joined_link_data = pd.concat(map(pd.read_csv, joined_list), ignore_index=True)
joined_link_data

Unnamed: 0,pdf_link,year,concepts,subconcepts
0,https://academic.oup.com/bioinformatics/articl...,2008,Computer science,World Wide Web
1,https://link.springer.com/content/pdf/10.1057%...,1977,Business,Marketing
2,https://research.tees.ac.uk/ws/files/5947237/5...,2009,Medicine,Physical therapy
3,https://academic.oup.com/jpart/article-pdf/16/...,2005,Political science,Public relations
4,http://jem.rupress.org/content/189/6/991.full.pdf,1999,Biology,Immunology
...,...,...,...,...
8154,http://case.edu/cse/eche/daigroup/Journal Arti...,2010,Materials science,Chemical engineering
8155,http://www.jbc.org/content/282/33/24131.full.pdf,2007,Biology,Cell biology
8156,https://www.dora.lib4ri.ch/eawag/islandora/obj...,2001,Chemistry,Environmental chemistry
8157,https://jamanetwork.com/journals/jama/articlep...,2007,Medicine,Internal medicine


In [4]:
# Quick checks on how data is distributed

joined_link_data.groupby(by='concepts').count()

Unnamed: 0_level_0,pdf_link,year,subconcepts
concepts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Art,2,2,2
Biology,2298,2298,2298
Business,122,122,122
Chemistry,478,478,478
Computer science,1339,1339,1339
Economics,201,201,201
Engineering,12,12,12
Environmental science,173,173,173
Geography,45,45,45
Geology,70,70,70


In [5]:
joined_link_data.groupby(by='subconcepts').count()

Unnamed: 0_level_0,pdf_link,year,concepts
subconcepts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Accounting,8,8,8
Acoustics,4,4,4
Actuarial science,10,10,10
Advertising,10,10,10
Aerospace engineering,2,2,2
...,...,...,...
Virology,93,93,93
Visual arts,2,2,2
Waste management,4,4,4
World Wide Web,52,52,52


In [6]:
joined_link_data.groupby(by='year').count()

Unnamed: 0_level_0,pdf_link,concepts,subconcepts
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1857,2,2,2
1879,1,1,1
1895,1,1,1
1900,2,2,2
1904,2,2,2
...,...,...,...
2017,91,91,91
2018,60,60,60
2019,47,47,47
2020,120,120,120


In [7]:
# Save to new csv file

joined_link_data.to_csv(f'../raw_data/link_data/joined_link_data_1-100.csv', index=False)