Skip to content

Commit

Permalink
refactor fetching of Wikipedia articles
Browse files Browse the repository at this point in the history
and support Wikidata IDs
  • Loading branch information
peterstadler committed Jan 25, 2023
1 parent 3556ba9 commit 5985d94
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 24 deletions.
26 changes: 8 additions & 18 deletions modules/app.xqm
Original file line number Diff line number Diff line change
Expand Up @@ -299,8 +299,7 @@ declare
case 'XML-Preview' return 'xml.html'
case 'examples' return if(gl:schemaIdent2docType($model?schemaID) = (for $func in $wdt:functions return $func(())('name'))) then 'examples.html' else ()
case 'wikipedia-article' return
if($model?gnd and exists(er:grab-external-resource-wikidata($model?gnd, 'gnd')//sr:binding[@name=('article' || upper-case($lang))]/sr:uri/data(.))) then 'wikipedia.html'
else if($model?viaf and exists(er:grab-external-resource-wikidata($model?viaf, 'viaf')//sr:binding[@name=('article' || upper-case($lang))]/sr:uri/data(.))) then 'wikipedia.html'
if(($model?doc//tei:idno | $model?doc//mei:altId) => er:wikipedia-article-url($lang)) then 'wikipedia.html'
else ()
case 'adb-article' return if($model?gnd and er:lookup-gnd-from-beaconProvider('adbBeacon', $model?gnd)) then 'adb.html' else ()
case 'ndb-article' return if($model?gnd and er:lookup-gnd-from-beaconProvider('ndbBeacon', $model?gnd)) then 'ndb.html' else ()
Expand Down Expand Up @@ -689,10 +688,9 @@ declare
:)

declare function app:place-details($node as node(), $model as map(*)) as map(*) {
let $geonames-id := str:normalize-space(($model?doc//tei:idno[@type='geonames'])[1])
let $gnd := query:get-gnd($model('doc'))
let $gn-doc := er:grabExternalResource('geonames', $geonames-id, ())
let $basic-data := app:place-basic-data($node, $model)
let $gnd := query:get-gnd($model('doc'))
let $gn-doc := er:grabExternalResource('geonames', $basic-data?geonames-id, ())
return
map:merge((
map {
Expand Down Expand Up @@ -1021,19 +1019,11 @@ declare
%templates:wrap
%templates:default("lang", "en")
function app:wikipedia($node as node(), $model as map(*), $lang as xs:string) as map(*) {
let $gnd := query:get-gnd($model('doc'))
let $viaf := if($gnd) then () else query:get-viaf($model('doc'))
let $wikiContent :=
if($gnd) then er:grabExternalResource('wikipedia', $gnd, $lang)
else er:grabExternalResource('wikipediaVIAF', $viaf, $lang)
let $wikiUrl := $wikiContent//xhtml:div[@class eq 'printfooter']/xhtml:a[1]/data(@href)
let $wikiName := normalize-space($wikiContent//xhtml:h1[@id = 'firstHeading'])
return
map {
'wikiContent' : $wikiContent,
'wikiUrl' : $wikiUrl,
'wikiName' : $wikiName
}
(: wikiUrl including the version info :)
(: let $wikiUrl := $wikiContent//xhtml:div[@class eq 'printfooter']/xhtml:a[1]/data(@href) :)
let $wikiUrl as xs:anyURI := ($model?doc//tei:idno | $model?doc//mei:altId) => er:wikipedia-article-url($lang)
return
er:wikipedia-article($wikiUrl, $lang)
};


Expand Down
53 changes: 51 additions & 2 deletions modules/external-requests.xqm
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ declare namespace schema="http://schema.org/";
declare namespace util="http://exist-db.org/xquery/util";
declare namespace request="http://exist-db.org/xquery/request";
declare namespace wikidata-property="http://www.wikidata.org/prop/direct/";
declare namespace xhtml="http://www.w3.org/1999/xhtml";

import module namespace config="http://xquery.weber-gesamtausgabe.de/modules/config" at "config.xqm";
import module namespace lang="http://xquery.weber-gesamtausgabe.de/modules/lang" at "lang.xqm";
Expand All @@ -43,6 +44,7 @@ declare function er:grabExternalResource($resource as xs:string, $id as xs:strin
let $botPresent := er:bot-present()
let $url :=
switch($resource)
(: the first two are deprecated and are to be replaced by `er:wikipedia-article#2` :)
case 'wikipediaVIAF' return (er:grab-external-resource-wikidata($id, 'viaf')//sr:binding[@name=('article' || upper-case($lang))]/sr:uri/data(.))[1]
case 'wikipedia' return (er:grab-external-resource-wikidata($id, 'gnd')//sr:binding[@name=('article' || upper-case($lang))]/sr:uri/data(.))[1]
case 'dnb' return concat('https://d-nb.info/gnd/', $id, '/about/rdf')
Expand Down Expand Up @@ -77,7 +79,9 @@ declare function er:grab-external-resource-via-beacon($beaconProvider as xs:stri
: see http://expath.org/modules/http-client/
:)
declare function er:grab-external-resource-wikidata($id as xs:string, $authority-provider as xs:string) as element(er:response)? {
let $uri := er:wikidata-url($id, $authority-provider)
let $uri :=
if($authority-provider eq 'wikidata') then xs:anyURI('http://www.wikidata.org/entity/' || $id || '.rdf')
else er:wikidata-url($id, $authority-provider)
let $fileName := util:hash($uri, 'md5') || '.xml'
return
if(er:bot-present() or not($uri))
Expand Down Expand Up @@ -319,7 +323,7 @@ declare function er:viaf2gnd($viaf as xs:string) as xs:string* {
declare function er:translate-authority-id($idno as element(), $to as xs:string) as xs:string* {
let $wikidata :=
switch($idno/@type)
case 'wikidata' return er:cached-external-request(xs:anyURI('http://www.wikidata.org/entity/' || string($idno) || '.rdf'), str:join-path-elements(($config:tmp-collection-path, 'wikidata', string($idno) || '.xml')))
case 'wikidata' return er:cached-external-request(er:grab-external-resource-wikidata(string($idno), 'wikidata'), str:join-path-elements(($config:tmp-collection-path, 'wikidata', string($idno) || '.xml')))
case 'gnd' case 'viaf' case 'geonames' return er:grab-external-resource-wikidata(string($idno), $idno/@type)
default return ()
return (
Expand Down Expand Up @@ -359,3 +363,48 @@ declare function er:beacon-map($gnd as xs:string, $docType as xs:string) as map(
)
else map {}
};

(:~
 : Fetch Wikipedia article URL via any authority ID
 : The supported IDs are tried one after another (in sorted order): Wikidata is queried
 : for the first ID and, if that yields no article URL, the function recurses on the remaining IDs.
 :
 : @param $idno a sequence of either tei:idno or mei:altId elements
 : @param $lang the language variable (de|en)
 : @return a sequence of URLs to Wikipedia articles in the requested language
 :      (usually that should be only one article URL, though); the empty sequence if nothing was found
 :)
declare function er:wikipedia-article-url($idno as element()*, $lang as xs:string) as xs:anyURI* {
    let $supported-authority-providers := ('gnd', 'viaf', 'wikidata', 'geonames')
    let $ids := $idno[self::tei:idno or self::mei:altId][@type=$supported-authority-providers] => sort() (: sort items for reproducibility :)
    let $cur-id := $ids[1]
    (: prefix of article URLs in the requested language edition, e.g. https://en.wikipedia.org/wiki :)
    let $wikipedia-prefix := 'https://' || lower-case($lang) || '.wikipedia.org/wiki'
    let $url := (
        switch($cur-id/@type => string())
        (: the Wikidata entity RDF lists the articles of all language editions, so filter by the language prefix :)
        case 'wikidata' return (er:grab-external-resource-wikidata($cur-id, 'wikidata')//rdf:Description[starts-with(@rdf:about, $wikipedia-prefix)][rdf:type/@rdf:resource="http://schema.org/Article"]/@rdf:about)
        (: no (more) supported IDs available :)
        case '' return ()
        (: all other providers are resolved via the result bindings named articleDE, articleEN, … :)
        default return (er:grab-external-resource-wikidata($cur-id, $cur-id/@type)//sr:binding[@name=('article' || upper-case($lang))]/sr:uri/data(.))
    )[string(.) ne ''] (: drop empty values, which the former effective-boolean-value test treated as "no match" :)
    return
        (: use exists() rather than the effective boolean value, which raises FORG0006 for sequences of several atomic values :)
        if(exists($url)) then $url
        else if(count($ids) gt 1) then er:wikipedia-article-url(subsequence($ids, 2), $lang)
        else ()
};

(:~
 : Retrieve the Wikipedia article(s) behind the given URL(s)
 : Every URL is requested through `er:cached-external-request#2`; the cache file is
 : located in the 'wikipedia' sub-collection of `$config:tmp-collection-path` and
 : named after the MD5 hash of the URL.
 :
 : @param $wikiUrls the URLs of the Wikipedia articles (see `er:wikipedia-article-url#2`)
 : @param $lang the language variable (de|en); not evaluated by this function
 : @return one map per URL with the keys 'wikiContent' (the response),
 :      'wikiUrl' (the requested URL), and 'wikiName' (the normalized text of the h1 with ID 'firstHeading')
 :)
declare function er:wikipedia-article($wikiUrls as xs:anyURI*, $lang as xs:string) as map(*)* {
    for $article-url in $wikiUrls
    let $cache-file := util:hash($article-url, 'md5') || '.xml'
    let $article :=
        if(not($article-url)) then ()
        else
            er:cached-external-request(
                $article-url,
                str:join-path-elements(($config:tmp-collection-path, 'wikipedia', $cache-file))
            )
    return
        map {
            'wikiContent' : $article,
            'wikiUrl' : $article-url,
            'wikiName' : normalize-space($article//xhtml:h1[@id = 'firstHeading'])
        }
};
6 changes: 2 additions & 4 deletions modules/img.xqm
Original file line number Diff line number Diff line change
Expand Up @@ -178,10 +178,8 @@ declare %private function img:wikidata-images($model as map(*), $lang as xs:stri
: @return
:)
declare %private function img:wikipedia-images($model as map(*), $lang as xs:string) as map(*)* {
let $gnd := query:get-gnd($model('doc'))
let $wikiArticle :=
if($gnd) then er:grabExternalResource('wikipedia', $gnd, $lang)
else ()
let $wikiModel := ($model?doc//tei:idno | $model?doc//mei:altId) => er:wikipedia-article-url($lang) => er:wikipedia-article($lang)
let $wikiArticle := $wikiModel?wikiContent
(: Look for images in wikipedia infobox (for organizations and english wikipedia) and thumbnails :)
let $images := $wikiArticle//xhtml:img[@class='thumbimage' or ancestor::xhtml:table[contains(@class, 'vcard') or contains(@class, 'toptextcells')] or ancestor::xhtml:div[@class='thumbinner']]
return
Expand Down

0 comments on commit 5985d94

Please sign in to comment.