# Exploring the approach to get the download links from our taverne repository for our publications

Just run everything top to bottom

---

# Libraries

In [1]:
# !pip install lxml
# !pip install beautifulsoup4

In [1]:
import codecs
import unicodedata
from difflib import SequenceMatcher
from urllib.parse import quote_plus

import bibtexparser
import latexcodec
import requests
# import lxml
from bs4 import BeautifulSoup

from bibreader import parse_bibtex_file, get_bib_blocks

# DIAG bib

### Download diag.bib and fullstrings.bib
I downloaded them like this so you don't have to do this manually; I'm not sure whether this always works.

In [2]:
download_bibs = False #set to True if you want to download the diag.bib and fullstrings.bib

diag.bib

In [3]:
diag_bib_raw_url = 'https://raw.githubusercontent.com/DIAGNijmegen/Literature/main/diag.bib'

# Only touch the network when a download was requested. The text is only used
# by the write cell guarded by the same flag, so None is a safe placeholder.
r_diag_bib_raw = None
if download_bibs:
    diag_bib_response = requests.get(diag_bib_raw_url, timeout=30)
    # Fail loudly instead of later saving an HTTP error page as diag.bib.
    diag_bib_response.raise_for_status()
    r_diag_bib_raw = diag_bib_response.text

In [4]:
if download_bibs:
    # Persist the downloaded text next to the notebook; the utf-8-sig codec
    # writes a byte-order mark at the start of the file.
    with open("diag.bib", 'w', encoding="utf-8-sig") as diag_bib_file:
        diag_bib_file.write(r_diag_bib_raw)

fullstrings.bib

In [5]:
fullstrings_raw_url = 'https://raw.githubusercontent.com/DIAGNijmegen/Literature/main/fullstrings.bib'

# Only touch the network when a download was requested. The text is only used
# by the write cell guarded by the same flag, so None is a safe placeholder.
r_fullstrings_raw = None
if download_bibs:
    fullstrings_response = requests.get(fullstrings_raw_url, timeout=30)
    # Fail loudly instead of later saving an HTTP error page as fullstrings.bib.
    fullstrings_response.raise_for_status()
    r_fullstrings_raw = fullstrings_response.text

In [6]:
if download_bibs:
    # Persist the downloaded text next to the notebook; the utf-8-sig codec
    # writes a byte-order mark at the start of the file.
    with open("fullstrings.bib", 'w', encoding="utf-8-sig") as fullstrings_file:
        fullstrings_file.write(r_fullstrings_raw)

### Load bibfile with website-content's bibreader module

In [7]:
# Both paths are relative, so the files are looked up in the notebook's working directory.
diag_bib_path = 'diag.bib'  # bibliography file to parse
fullstrings_path = 'fullstrings.bib'  # handed to the parser together with diag.bib

# parse_bibtex_file is imported from bibreader; its result supports len() and .items() (used below).
diag_bib = parse_bibtex_file(diag_bib_path, fullstrings_path)
len(diag_bib)  # number of parsed entries, shown as the cell output

no year found in bibitem. skipping bibitem: {'type': 'string', 'pmidnumber': -1}
no year found in bibitem. skipping bibitem: {'type': 'comment', 'pmidnumber': -1}


1614

---

---

# Example publication

In [8]:
example_title='Identification of Risk of Cardiovascular Disease by Automatic Quantification of Coronary Artery Calcifications on Radiotherapy Planning CT Scans in Patients With Breast Cancer'
example_title

'Identification of Risk of Cardiovascular Disease by Automatic Quantification of Coronary Artery Calcifications on Radiotherapy Planning CT Scans in Patients With Breast Cancer'

In [9]:
example_link='https://repository.ubn.ru.nl/handle/2066/235298'

In [10]:
example_search_url = 'https://repository.ubn.ru.nl/discover?query=Identification+of+Risk+of+Cardiovascular+Disease+by+Automatic+Quantification+of+Coronary+Artery+Calcifications+on+Radiotherapy+Planning+CT+Scans+in+Patients+With+Breast+Cancer&scope='
example_search_url

'https://repository.ubn.ru.nl/discover?query=Identification+of+Risk+of+Cardiovascular+Disease+by+Automatic+Quantification+of+Coronary+Artery+Calcifications+on+Radiotherapy+Planning+CT+Scans+in+Patients+With+Breast+Cancer&scope='

# Get example bib entry

In [11]:
example_title = 'Identification of Risk of Cardiovascular Disease by Automatic Quantification of Coronary Artery Calcifications on Radiotherapy Planning CT Scans in Patients With Breast Cancer'
# Keep the key of every bib entry whose string representation contains the title.
hits = [
    bib_key
    for bib_key, bib_entry in diag_bib.items()
    if example_title in str(bib_entry)
]
hits

['gal21']

perfect match in this case

In [12]:
example_bib = diag_bib[hits[0]]

## Get title and DOI

### Title

In [13]:
example_bib_title = example_bib['title']
example_bib_title

'Identification of Risk of Cardiovascular Disease by Automatic Quantification of Coronary Artery Calcifications on Radiotherapy Planning CT Scans in Patients With Breast Cancer'

### DOI

In [14]:
example_bib_doi = example_bib['doi']
example_bib_doi

'https://doi.org/10.1001/jamaoncol.2021.1144'

---

---

# Search publication in repository

### Recreate search url from title

In [16]:
# quote_plus turns spaces into '+' just like the manual replace did, and also
# percent-encodes characters such as '&', '#', '?' or non-ASCII letters that
# would otherwise corrupt the query string.
plus_title = quote_plus(example_bib_title.strip())
plus_title

'Identification+of+Risk+of+Cardiovascular+Disease+by+Automatic+Quantification+of+Coronary+Artery+Calcifications+on+Radiotherapy+Planning+CT+Scans+in+Patients+With+Breast+Cancer'

In [17]:
search_url=f'https://repository.ubn.ru.nl/discover?query={plus_title}&scope='
print(search_url)
search_url == example_search_url

https://repository.ubn.ru.nl/discover?query=Identification+of+Risk+of+Cardiovascular+Disease+by+Automatic+Quantification+of+Coronary+Artery+Calcifications+on+Radiotherapy+Planning+CT+Scans+in+Patients+With+Breast+Cancer&scope=


True

### Search, request and soup

In [18]:
# Bound the wait and stop on HTTP errors before parsing, so an error page is
# never mistaken for a (result-less) search page.
r_search = requests.get(search_url, timeout=30)
r_search.raise_for_status()
bs_search = BeautifulSoup(r_search.text, 'lxml')

### Get search results

In [19]:
# Container div of the search hits; find() yields None when no element has this id.
search_results = bs_search.find('div', id='aspect_discovery_SimpleSearch_div_search-results')
# All 'artifact-description' divs inside the container (presumably one per hit; the first is used below).
results = search_results.find_all('div', class_='artifact-description')

## First result

In [20]:
first_result = results[0]

### Slug

In [21]:
search_result_slug = first_result.a['href']
search_result_slug

'/handle/2066/235298'

### Title

In [22]:
search_result_title = first_result.a.h4.text.strip()
search_result_title

'Identification of Risk of Cardiovascular Disease by Automatic Quantification of Coronary Artery Calcifications on Radiotherapy Planning CT Scans in Patients With Breast Cancer'

We could set a threshold here

In [23]:
# Trim and lowercase both titles so the comparison ignores case and surrounding whitespace.
a = example_bib_title.strip().lower()
b = search_result_title.strip().lower()
# ratio() is a similarity score between 0 and 1; 1.0 means both strings are identical.
title_match_ratio = SequenceMatcher(a=a, b=b).ratio()
title_match_ratio

1.0

In [24]:
title_match_ratio > 0.9

True

---

---

# Check out first result link

### Recreate publication link

In [25]:
repo_url_base = "https://repository.ubn.ru.nl"

In [26]:
best_match_url = repo_url_base+search_result_slug
best_match_url

'https://repository.ubn.ru.nl/handle/2066/235298'

Check

In [27]:
best_match_url == example_link

True

### Best match, request and soup

In [28]:
# Bound the wait and stop on HTTP errors before parsing, so an error page is
# never parsed as if it were the publication page.
r_best_match = requests.get(best_match_url, timeout=30)
r_best_match.raise_for_status()
bs_best_match = BeautifulSoup(r_best_match.text, 'lxml')

### DOI

In [29]:
# The DOI is taken from the href of the first link inside the 'simple-item-view-doi' div.
best_match_doi_div = bs_best_match.find('div', class_='simple-item-view-doi')
best_match_doi = best_match_doi_div.a['href']
best_match_doi

'https://doi.org/10.1001/jamaoncol.2021.1144'

Check

In [32]:
example_bib_doi == best_match_doi

True

---

---

# Combine everything

In [15]:
def _normalize_doi(doi):
    """Reduce a DOI or DOI URL to a bare, lowercase DOI for comparison.

    DOI names are case-insensitive and may be written with or without a
    resolver prefix, so both sides are normalised before being compared.
    Returns an empty string when no DOI is given.
    """
    if not doi:
        return ''
    bare = str(doi).strip().lower()
    for prefix in ('https://doi.org/', 'http://doi.org/',
                   'https://dx.doi.org/', 'http://dx.doi.org/', 'doi:'):
        if bare.startswith(prefix):
            return bare[len(prefix):].strip()
    return bare


def get_publication_page_url(title, doi, min_title_ratio=0.9, timeout=30):
    """Look up a publication in the repository and return its page URL.

    The repository is searched by title. The first hit is accepted only when
    its title is similar enough to ``title`` AND the DOI shown on its page
    matches ``doi``.

    Parameters
    ----------
    title : str
        Title of the publication to search for.
    doi : str
        Expected DOI, as a bare DOI or a DOI URL; compared case-insensitively.
    min_title_ratio : float, optional
        The SequenceMatcher ratio must be strictly greater than this value.
    timeout : float, optional
        Timeout in seconds applied to each HTTP request.

    Returns
    -------
    str or None
        URL of the matching publication page, or None when there is no search
        hit, the title or DOI does not match, or the expected page elements
        are missing.

    Raises
    ------
    requests.HTTPError
        When the repository answers with an HTTP error status.
    """
    repo_url_base = "https://repository.ubn.ru.nl"
    if not title or not title.strip():
        return None
    wanted_title = title.strip()

    # Search. Passing the query via `params` lets requests URL-encode it, so
    # titles containing '&', '#', '?' or non-ASCII characters stay intact.
    r_search = requests.get(
        f'{repo_url_base}/discover',
        params={'query': wanted_title, 'scope': ''},
        timeout=timeout,
    )
    r_search.raise_for_status()
    bs_search = BeautifulSoup(r_search.text, 'lxml')
    search_results = bs_search.find('div', id='aspect_discovery_SimpleSearch_div_search-results')
    if search_results is None:
        return None
    results = search_results.find_all('div', class_='artifact-description')
    if not results:
        return None

    # First result
    first_link = results[0].a
    if first_link is None or first_link.h4 is None:
        return None
    search_result_slug = first_link.get('href')
    if not search_result_slug:
        return None
    search_result_title = first_link.h4.text.strip()

    # TITLE RATIO: compare the hit against the requested title, not a
    # notebook-level example variable.
    title_match_ratio = SequenceMatcher(
        a=wanted_title.lower(), b=search_result_title.lower()
    ).ratio()
    if title_match_ratio <= min_title_ratio:
        # No need to fetch the item page when the title already rules it out.
        return None

    # First result page
    best_match_url = repo_url_base + search_result_slug
    r_best_match = requests.get(best_match_url, timeout=timeout)
    r_best_match.raise_for_status()
    bs_best_match = BeautifulSoup(r_best_match.text, 'lxml')
    best_match_doi_div = bs_best_match.find('div', class_='simple-item-view-doi')
    if best_match_doi_div is None or best_match_doi_div.a is None:
        return None

    # DOI
    best_match_doi = _normalize_doi(best_match_doi_div.a.get('href'))
    if best_match_doi and best_match_doi == _normalize_doi(doi):
        return best_match_url
    return None

In [16]:
print(example_bib_title)
print(example_bib_doi)

Identification of Risk of Cardiovascular Disease by Automatic Quantification of Coronary Artery Calcifications on Radiotherapy Planning CT Scans in Patients With Breast Cancer
https://doi.org/10.1001/jamaoncol.2021.1144


In [17]:
get_publication_page_url(example_bib_title, example_bib_doi)

'https://repository.ubn.ru.nl/handle/2066/235298'

---

---

---

---

---

---

---

---

# Other stuff

### Authors

In [133]:
# Authors block of the publication page; that page's soup is `bs_best_match`
# (no cell shown in this notebook defines `bs_link`).
authors = bs_best_match.find('div', class_='simple-item-view-authors')
authors

<div class="simple-item-view-authors item-page-field-wrapper table">
<h5>Author(s)</h5>
<div class="ds-dc_contributor_author-authority">
<a href="/browse?type=author&amp;authority=633af101fe822e2bef176974b4c9394fe68539b6">Gal, R.</a>
</div>
<div class="ds-dc_contributor_author-authority">
<a href="/browse?type=author&amp;authority=00622b33ee1f92cc394a320e5cbb8eee0c9c9410">Velzen, S.G.M. van</a>
</div>
<div class="ds-dc_contributor_author-authority">
<a href="/browse?type=author&amp;authority=9a539b16085b4ce51bba03a15a3340068dca2a90">Hooning, M.J.</a>
</div>
<div class="ds-dc_contributor_author-authority">
<a href="/browse?type=author&amp;authority=068bcf5e4f507c3a49c4955588de6ccd5e14f7fc">Emaus, M.J.</a>
</div>
<div class="ds-dc_contributor_author-authority">
<a href="/browse?type=author&amp;authority=a8ee98f8aa2324c0d030444bf12448b1">Leij, F. van der</a>
</div>
<div class="ds-dc_contributor_author-authority">
<a href="/browse?type=author&amp;authority=dea26ba664477987f484700fd9b50c90d

In [143]:
# One stripped display name per author element of the publication page; that
# page's soup is `bs_best_match` (no cell shown in this notebook defines `bs_link`).
authors_list = [
    author_div.text.strip()
    for author_div in bs_best_match.find_all('div', class_='ds-dc_contributor_author-authority')
]
authors_list

['Gal, R.',
 'Velzen, S.G.M. van',
 'Hooning, M.J.',
 'Emaus, M.J.',
 'Leij, F. van der',
 'Gregorowitsch, M.L.',
 'Blezer, E.L.A.',
 'Gernaat, S.A.M.',
 'Lessmann, N.',
 'Sattler, M.G.',
 'Leiner, T.',
 'Jong, Pa. de',
 'Teske, A.J.',
 'Verloop, J.',
 'Penninkhof, J.J.',
 'Vaartjes, I.',
 'Meijer, H.J.M.',
 'Tol-Geerdink, J.J. van',
 'Pignol, J.P.',
 'Bongard, D. van den',
 'Isgum, I.',
 'Verkooijen, H.M.']

In [144]:
# Everything before the first comma of each "Surname, Initials" string.
surnames_list = [full_name.partition(',')[0] for full_name in authors_list]
surnames_list

['Gal',
 'Velzen',
 'Hooning',
 'Emaus',
 'Leij',
 'Gregorowitsch',
 'Blezer',
 'Gernaat',
 'Lessmann',
 'Sattler',
 'Leiner',
 'Jong',
 'Teske',
 'Verloop',
 'Penninkhof',
 'Vaartjes',
 'Meijer',
 'Tol-Geerdink',
 'Pignol',
 'Bongard',
 'Isgum',
 'Verkooijen']